Uploaded image for project: 'DC/OS'
  1. DC/OS
  2. DCOS_OSS-4732

test_metrics.test_prom_metrics_containers_app is flaky

    Details

    • Transition Due Date:

      Description

      https://teamcity.mesosphere.io/viewLog.html?buildId=1499792&buildTypeId=DcOs_Open_Test_IntegrationTest_AwsOnpremWStaticBackendGroup2

      Failing test output (Python, pytest traceback):
      test_metrics.py:514 (test_prom_metrics_containers_app)
      dcos_api_session = <dcos_test_utils.dcos_api.DcosApiSession object at 0x7fdaf6a705f8>
      
          def test_prom_metrics_containers_app(dcos_api_session):
              """Assert that prometheus app metrics appear in the v0 metrics API."""
              task_name = 'test-prom-metrics-containers-app'
              metric_name_pfx = 'test_prom_metrics_containers_app'
              marathon_app = {
                  'id': '/' + task_name,
                  'instances': 1,
                  'cpus': 0.1,
                  'mem': 128,
                  'cmd': '\n'.join([
                      'echo "Creating metrics file..."',
                      'touch metrics',
          
                      'echo "# TYPE {}_gauge gauge" >> metrics'.format(metric_name_pfx),
                      'echo "{}_gauge 100" >> metrics'.format(metric_name_pfx),
          
                      'echo "# TYPE {}_count counter" >> metrics'.format(metric_name_pfx),
                      'echo "{}_count 2" >> metrics'.format(metric_name_pfx),
          
                      'echo "# TYPE {}_histogram histogram" >> metrics'.format(metric_name_pfx),
                      'echo "{}_histogram_bucket{{le=\\"+Inf\\"}} 4" >> metrics'.format(metric_name_pfx),
                      'echo "{}_histogram_sum 4" >> metrics'.format(metric_name_pfx),
                      'echo "{}_histogram_seconds_count 4" >> metrics'.format(metric_name_pfx),
          
                      'echo "Serving prometheus metrics on http://localhost:$PORT0"',
                      'python3 -m http.server $PORT0',
                  ]),
                  'container': {
                      'type': 'MESOS',
                      'docker': {'image': 'library/python:3'}
                  },
                  'portDefinitions': [{
                      'protocol': 'tcp',
                      'port': 0,
                      'labels': {'DCOS_METRICS_FORMAT': 'prometheus'},
                  }],
              }
          
              logging.debug('Starting marathon app with config: %s', marathon_app)
              expected_metrics = [
                  # metric_name, metric_value
                  ('_'.join([metric_name_pfx, 'gauge.gauge']), 100),
                  ('_'.join([metric_name_pfx, 'count.counter']), 2),
                  ('_'.join([metric_name_pfx, 'histogram_seconds', 'count']), 4),
              ]
          
      >       with dcos_api_session.marathon.deploy_and_cleanup(marathon_app, check_health=False):
      
      test_metrics.py:561: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      ../python--b993cadd48a5f4550cd6a41af2cbb9a9012bb2e9/lib/python3.6/contextlib.py:81: in __enter__
          return next(self.gen)
      ../../lib/python3.6/site-packages/dcos_test_utils/marathon.py:293: in deploy_and_cleanup
          app_definition, check_health, ignore_failed_tasks, timeout=timeout)
      ../../lib/python3.6/site-packages/dcos_test_utils/marathon.py:174: in deploy_app
          check_health, ignore_failed_tasks, timeout)
      ../../lib/python3.6/site-packages/dcos_test_utils/marathon.py:143: in wait_for_app_deployment
          wait()
      ../../lib/python3.6/site-packages/retrying.py:49: in wrapped_f
          return Retrying(*dargs, **dkw).call(f, *args, **kw)
      ../../lib/python3.6/site-packages/retrying.py:206: in call
          return attempt.get(self._wrap_exception)
      ../../lib/python3.6/site-packages/retrying.py:247: in get
          six.reraise(self.value[0], self.value[1], self.value[2])
      ../../lib/python3.6/site-packages/six.py:686: in reraise
          raise value
      ../../lib/python3.6/site-packages/retrying.py:200: in call
          attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
      ../../lib/python3.6/site-packages/dcos_test_utils/marathon.py:142: in wait
          return self.check_app_instances(app_id, app_instances, check_health, ignore_failed_tasks)
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      
      self = <dcos_test_utils.marathon.Marathon object at 0x7fdaf6a2b6a0>
      app_id = '/test-prom-metrics-containers-app', app_instances = 1
      check_health = False, ignore_failed_tasks = False
      
          def check_app_instances(
                  self,
                  app_id: str,
                  app_instances: int,
                  check_health: bool,
                  ignore_failed_tasks: bool) -> bool:
              """ Check a marathon app ID and return True if healthy
          
                  Args:
                      app_id: marathon app ID to be checked
                      app_instances: number of expected app instances
                      check_health: if True, health check status must pass to return True
                      ignore_failed_tasks: if False, any failed tasks will result in an exception
                  """
              # Some of the counters need to be explicitly enabled now and/or in
              # future versions of Marathon:
              req_params = (('embed', 'apps.lastTaskFailure'),
                            ('embed', 'apps.counts'))
          
              log.info('Waiting for application to be deployed...')
              r = self.get(path_join('/v2/apps', app_id), params=req_params)
              r.raise_for_status()
          
              data = r.json()
              log.debug('Current application state data: {}'.format(repr(data)))
          
              if 'lastTaskFailure' in data['app']:
                  message = data['app']['lastTaskFailure']['message']
                  if not ignore_failed_tasks:
      >               raise AssertionError('Application deployment failed, reason: {}'.format(message))
      E               AssertionError: Application deployment failed, reason: Failed to launch container: Collect failed: Failed to perform 'curl': curl: (28) Operation too slow. Less than 1 bytes/sec transferred the last 60 seconds
      

        Attachments

          Issue Links

            Activity

              People

              • Assignee:
                philip Philip Norman
                Reporter:
                tomaszjaniszewski Tomasz Janiszewski
                Team:
                Observability Team
                Watchers:
                Alex Rukletsov, Branden Rolston, Charles Provencher, Deepak Goel, Dima Rozhkov, Gastón Kleiman, Gauri Powale, Gilbert Song, Greg Mann, Gustav Paul, Jan-Philip Gehrcke, Julian Gieseke, Matthias Eichstedt, Mergebot, Orlando Hohmeier, Philip Norman, Senthil Kumaran, Sergey Urbanovich, Till Toenshoff, Tim Weidner, Tomasz Janiszewski
              • Watchers:
                21 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: