DC/OS / DCOS_OSS-4922

test_rexray.test_move_external_volume_to_new_agent fails on master

    Details

    • Sprint:
      Storage: RI-12 Sprint 43, Storage: RI-13 Sprint 44
    • Story Points:
      2

      Description

      Since DCOS-21559 recently re-enabled that test, we see the following failure:

      open_source_tests/test_rexray.py:12 (test_move_external_volume_to_new_agent)
      self = <dcos_test_utils.marathon.Marathon object at 0x7f152199de48>
      app_definition = {'cmd': '[ $(ls -A volume/ | grep -v --line-regexp "lost+found" | wc -l) -eq 0 ] && echo "72d4da1b46554bac8e8eb110e0b6...b110e0b67d5b', 'options': {'dvdi/driver': 'rexray'}, 'provider': 'dvdi', 'size': 1}, 'mode': 'RW'}]}, 'cpus': 0.1, ...}
      check_health = False, ignore_failed_tasks = True, timeout = 600
      
      def deploy_app(self, app_definition, check_health=True, ignore_failed_tasks=False, timeout=1200):
          """Deploy an app to marathon

          This function deploys an application and then waits for marathon to
          acknowledge its successful creation or fails the test.

          The wait for the application is immediately aborted if Marathon returns a
          nonempty 'lastTaskFailure' field. Otherwise it waits until all the
          instances reach the tasksRunning and then the tasksHealthy state.

          Args:
              app_definition: a dict with the application definition as specified in the
                  Marathon API (https://mesosphere.github.io/marathon/docs/rest-api.html#post-v2-apps)
              check_health: wait until Marathon reports tasks as healthy before
                  returning

          Returns:
              A list of named tuples which represent service points of the deployed
              applications, e.g.:
              [Endpoint(host='172.17.10.202', port=10464), Endpoint(host='172.17.10.201', port=1630)]
          """
          r = self.post('/v2/apps', json=app_definition)
          log.info('Response from marathon: {}'.format(repr(r.json())))
          r.raise_for_status()

          try:
              return self.wait_for_app_deployment(
                  app_definition['id'],
                  app_definition['instances'],
      >           check_health, ignore_failed_tasks, timeout)
      
      ../../lib/python3.6/site-packages/dcos_test_utils/marathon.py:174: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      
      self = <dcos_test_utils.marathon.Marathon object at 0x7f152199de48>
      app_id = '/integration-test-move-external-volume-72d4da1b46554bac8e8eb110e0b67d5b/write'
      app_instances = 1, check_health = False, ignore_failed_tasks = True
      timeout = 600
      
      def wait_for_app_deployment(
              self,
              app_id: str,
              app_instances: int,
              check_health: bool,
              ignore_failed_tasks: bool,
              timeout: int):
          """ Retries the check_app_instances function for a limited time

          Args:
              app_id: ID of the marathon app to check
              app_instances: expected number of instances
              check_health: if True, health checks must pass before unblocking
              ignore_failed_tasks: if False, then failed tasks will raise an exception
              timeout: time (in seconds) to wait before raising an exception
          """

          @retrying.retry(
              wait_fixed=5000,
              stop_max_delay=timeout*1000,
              retry_on_result=lambda res: res is False,
              retry_on_exception=lambda ex: False)
          def wait():
              return self.check_app_instances(app_id, app_instances, check_health, ignore_failed_tasks)
      >   wait()
      
      ../../lib/python3.6/site-packages/dcos_test_utils/marathon.py:143: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      
      args = (), kw = {}
      
      @six.wraps(f)
      def wrapped_f(*args, **kw):
      >   return Retrying(*dargs, **dkw).call(f, *args, **kw)
      
      ../../lib/python3.6/site-packages/retrying.py:49: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      
      self = <retrying.Retrying object at 0x7f1521d17d68>
      fn = <function Marathon.wait_for_app_deployment.<locals>.wait at 0x7f1520907b70>
      args = (), kwargs = {}, start_time = 1552609902269, attempt_number = 120
      attempt = Attempts: 120, Value: False, delay_since_first_attempt_ms = 600935
      sleep = 5000
      
      def call(self, fn, *args, **kwargs):
          start_time = int(round(time.time() * 1000))
          attempt_number = 1
          while True:
              try:
                  attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
              except:
                  tb = sys.exc_info()
                  attempt = Attempt(tb, attempt_number, True)

              if not self.should_reject(attempt):
                  return attempt.get(self._wrap_exception)

              delay_since_first_attempt_ms = int(round(time.time() * 1000)) - start_time
              if self.stop(attempt_number, delay_since_first_attempt_ms):
                  if not self._wrap_exception and attempt.has_exception:
                      # get() on an attempt with an exception should cause it to be raised, but raise just in case
                      raise attempt.get()
                  else:
      >               raise RetryError(attempt)
      E       retrying.RetryError: RetryError[Attempts: 120, Value: False]
      
      ../../lib/python3.6/site-packages/retrying.py:214: RetryError
      
      During handling of the above exception, another exception occurred:
      
      dcos_api_session = <dcos_test_utils.enterprise.EnterpriseApiSession object at 0x7f1521d17470>
      
      @pytest.mark.supportedwindows
      def test_move_external_volume_to_new_agent(dcos_api_session):
          """Test that an external volume is successfully attached to a new agent.

          If the dcos_api_session has only one agent, the volume will be detached and
          reattached to the same agent.
          """
          expanded_config = get_expanded_config()
          if not (expanded_config['provider'] == 'aws' or expanded_config['platform'] == 'aws'):
              pytest.skip('Must be run in an AWS environment!')

          hosts = dcos_api_session.slaves[0], dcos_api_session.slaves[-1]
          test_uuid = uuid.uuid4().hex
          test_label = 'integration-test-move-external-volume-{}'.format(test_uuid)
          mesos_volume_path = 'volume'
          docker_volume_path = '/volume'
          base_app = {
              'mem': 32,
              'cpus': 0.1,
              'instances': 1,
              'container': {
                  'volumes': [{
                      'mode': 'RW',
                      'external': {
                          'name': test_label,
                          'provider': 'dvdi',
                          'options': {'dvdi/driver': 'rexray'}
                      }
                  }]
              }
          }

          write_app = copy.deepcopy(base_app)
          write_app.update({
              'id': '/{}/write'.format(test_label),
              'cmd': (
                  # Check that the volume is empty.
                  '[ $(ls -A {volume_path}/ | grep -v --line-regexp "lost+found" | wc -l) -eq 0 ] && '
                  # Write the test UUID to a file.
                  'echo "{test_uuid}" >> {volume_path}/test && '
                  'while true; do sleep 1000; done'
              ).format(test_uuid=test_uuid, volume_path=mesos_volume_path),
              'constraints': [['hostname', 'LIKE', hosts[0]]],
          })
          write_app['container']['type'] = 'MESOS'
          write_app['container']['volumes'][0]['containerPath'] = mesos_volume_path
          write_app['container']['volumes'][0]['external']['size'] = 1

          read_app = copy.deepcopy(base_app)
          read_app.update({
              'id': '/{}/read'.format(test_label),
              'cmd': (
                  # Diff the file and the UUID.
                  'echo "{test_uuid}" | diff - {volume_path}/test && '
                  'while true; do sleep 1000; done'
              ).format(test_uuid=test_uuid, volume_path=docker_volume_path),
              'constraints': [['hostname', 'LIKE', hosts[1]]],
          })
          read_app['container'].update({
              'type': 'DOCKER',
              'docker': {
                  'image': 'busybox',
                  'network': 'HOST',
              }
          })
          read_app['container']['volumes'][0]['containerPath'] = docker_volume_path

          # Volume operations can take several minutes.
          timeout = 600

          deploy_kwargs = {
              'check_health': False,
              # A volume might fail to attach because EC2. We can tolerate that and retry.
              'ignore_failed_tasks': True,
              'timeout': timeout
          }

          try:
      >       with dcos_api_session.marathon.deploy_and_cleanup(write_app, **deploy_kwargs):
      
      open_source_tests/test_rexray.py:92: 
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      ../python--8f6820fa4343543b43cc1ad316e5bc7b89f11e34/lib/python3.6/contextlib.py:81: in __enter__
      return next(self.gen)
      ../../lib/python3.6/site-packages/dcos_test_utils/marathon.py:293: in deploy_and_cleanup
      app_definition, check_health, ignore_failed_tasks, timeout=timeout)
      _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
      
      self = <dcos_test_utils.marathon.Marathon object at 0x7f152199de48>
      app_definition = {'cmd': '[ $(ls -A volume/ | grep -v --line-regexp "lost+found" | wc -l) -eq 0 ] && echo "72d4da1b46554bac8e8eb110e0b6...b110e0b67d5b', 'options': {'dvdi/driver': 'rexray'}, 'provider': 'dvdi', 'size': 1}, 'mode': 'RW'}]}, 'cpus': 0.1, ...}
      check_health = False, ignore_failed_tasks = True, timeout = 600
      
      def deploy_app(self, app_definition, check_health=True, ignore_failed_tasks=False, timeout=1200):
          """Deploy an app to marathon

          This function deploys an application and then waits for marathon to
          acknowledge its successful creation or fails the test.

          The wait for the application is immediately aborted if Marathon returns a
          nonempty 'lastTaskFailure' field. Otherwise it waits until all the
          instances reach the tasksRunning and then the tasksHealthy state.

          Args:
              app_definition: a dict with the application definition as specified in the
                  Marathon API (https://mesosphere.github.io/marathon/docs/rest-api.html#post-v2-apps)
              check_health: wait until Marathon reports tasks as healthy before
                  returning

          Returns:
              A list of named tuples which represent service points of the deployed
              applications, e.g.:
              [Endpoint(host='172.17.10.202', port=10464), Endpoint(host='172.17.10.201', port=1630)]
          """
          r = self.post('/v2/apps', json=app_definition)
          log.info('Response from marathon: {}'.format(repr(r.json())))
          r.raise_for_status()

          try:
              return self.wait_for_app_deployment(
                  app_definition['id'],
                  app_definition['instances'],
                  check_health, ignore_failed_tasks, timeout)
          except retrying.RetryError:
              raise Exception("Application deployment failed - operation was not "
      >                       "completed in {} seconds.".format(timeout))
      E       Exception: Application deployment failed - operation was not completed in 600 seconds.

      from https://teamcity.mesosphere.io/viewLog.html?buildId=1696088&buildTypeId=DcOs_Enterprise_Test_Inte_AwsOnpremWStaticBackendAndSecurityStrictGroup2
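
      For triage context: the RetryError above is the dcos_test_utils poll loop
      giving up rather than the test itself crashing. wait_for_app_deployment
      polls check_app_instances every 5 seconds until stop_max_delay is reached,
      and all 120 attempts returned False, i.e. the write app's task never
      reached a running state within the 600-second budget. A minimal sketch of
      that loop (the always-False check_app_instances stand-in is an assumption
      that mirrors this particular log, not the real implementation):

      import retrying

      TIMEOUT = 600  # seconds; the value passed via deploy_kwargs in the test

      def check_app_instances():
          # Hypothetical stand-in for Marathon.check_app_instances(...); in this
          # failure every poll returned False (the task never started running).
          return False

      @retrying.retry(
          wait_fixed=5000,                           # poll every 5 s
          stop_max_delay=TIMEOUT * 1000,             # give up after 600 s
          retry_on_result=lambda res: res is False,  # keep polling while False
          retry_on_exception=lambda ex: False)       # any exception aborts the loop
      def wait():
          return check_app_instances()

      wait()  # after ~600 s this raises RetryError[Attempts: 120, Value: False]

      Since the test passes ignore_failed_tasks=True, any intermediate
      'lastTaskFailure' reported by Marathon is deliberately swallowed by the
      loop, so the Mesos agent and rexray logs on the first agent are presumably
      the place to look for why the external volume never attached.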


            People

            • Assignee:
              gpaul Gustav Paul
            • Reporter:
              till Till Toenshoff
            • Team:
              Storage Team
            • Watchers:
              Adam Dangoor, Alex Rukletsov (Inactive), Branden Rolston, Charles Provencher, Deepak Goel, Gastón Kleiman, Gauri Powale, Gilbert Song, Greg Mann, Gustav Paul, Jan-Philip Gehrcke, Jonathan Giddy, Julian Gieseke, Matthias Eichstedt, Mergebot, Orlando Hohmeier, Philip Norman, Senthil Kumaran, Sergey Urbanovich, Till Toenshoff, Tim Weidner
