tools: Fix idf_monitor test case occasional hangup or failure

author Roland Dobai <dobai.roland@gmail.com>

Tue, 6 Nov 2018 10:36:46 +0000 (11:36 +0100)

committer Roland Dobai <dobai.roland@gmail.com>

Fri, 23 Nov 2018 12:33:17 +0000 (13:33 +0100)
author Roland Dobai <dobai.roland@gmail.com>
Tue, 6 Nov 2018 10:36:46 +0000 (11:36 +0100)
committer Roland Dobai <dobai.roland@gmail.com>
Fri, 23 Nov 2018 12:33:17 +0000 (13:33 +0100)
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml

index 4bfea8f6471a0c2e62cd53ba2652e0fe21e4dd15..07ff7158e0addf79390252715a33be154de128d5 100644 (file)
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -547,7 +547,7 @@ test_idf_monitor:
      expire_in: 1 week
    script:
      - cd ${IDF_PATH}/tools/test_idf_monitor
-    - ${IDF_PATH}/tools/ci/multirun_with_pyenv.sh ./run_test_idf_monitor.py
+    - ./run_test_idf_monitor.py
  
  test_idf_size:
    <<: *host_test_template
diff --git a/tools/idf_monitor.py b/tools/idf_monitor.py

index f09e934a862f72b7481e2bfec9015322f9519e34..92ef71d96b1b233de57a12fae83df23a94a18ea3 100755 (executable)
--- a/tools/idf_monitor.py
+++ b/tools/idf_monitor.py
@@ -373,7 +373,7 @@ class Monitor(object):
                  else:
                      raise RuntimeError("Bad event data %r" % ((event_tag,data),))
          except SerialStopException:
-            pass
+            sys.stderr.write(ANSI_NORMAL + "Stopping condition has been received\n")
          finally:
              try:
                  self.console_reader.stop()
diff --git a/tools/test_idf_monitor/run_test_idf_monitor.py b/tools/test_idf_monitor/run_test_idf_monitor.py

index da59bea8657c5c5d27f77ec14fbc7b3d440e4407..748c379030525f7d07726b96b70ad362760705ee 100755 (executable)
--- a/tools/test_idf_monitor/run_test_idf_monitor.py
+++ b/tools/test_idf_monitor/run_test_idf_monitor.py
@@ -19,13 +19,16 @@ from __future__ import unicode_literals
  from builtins import object
  from io import open
  import os
-import signal
+import sys
  import time
  import subprocess
+import socket
  import pty
+import filecmp
+import threading
  
  test_list = (
-        # Add new tests here. All files should be placed in in_dir. Columns are:
+        # Add new tests here. All files should be placed in IN_DIR. Columns are:
          # Input file            Filter string                                               File with expected output
          ('in1.txt',             '',                                                         'in1f1.txt'),
          ('in1.txt',             '*:V',                                                      'in1f1.txt'),
@@ -35,89 +38,137 @@ test_list = (
          ('in2.txt',             'vfs',                                                      'in2f2.txt'),
          )
  
-in_dir = 'tests/'       # tests are in this directory
-out_dir = 'outputs/'    # test results are written to this directory (kept only for debugging purposes)
-socat_in = './socatfile'# temporary socat file (deleted after run)
-err_out = out_dir + 'monitor_error_output'
-elf_file = './dummy.elf'  # ELF file used for starting the monitor
-idf_monitor = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
+IN_DIR = 'tests/'       # tests are in this directory
+OUT_DIR = 'outputs/'    # test results are written to this directory (kept only for debugging purposes)
+ERR_OUT = OUT_DIR + 'monitor_error_output'
+ELF_FILE = './dummy.elf'  # ELF file used for starting the monitor
+IDF_MONITOR = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
  
-class SocatRunner(object):
-    """
-    Runs socat in the background for creating a socket.
-    """
+# connection related to communicating with idf_monitor through sockets
+HOST = 'localhost'
+# blocking socket operations are used with timeout:
+SOCKET_TIMEOUT = 30
+# idf_monitor is killed if it doesn't respond:
+MONITOR_TIMEOUT = 60
+# the test is restarted after failure (idf_monitor has to be killed):
+RETRIES_PER_TEST = 5
+
+def monitor_timeout(process):
+    if process.poll() is None:
+        # idf_monitor is still running
+        try:
+            process.kill()
+            print('\tidf_monitor was killed because it did not finish in {} seconds.'.format(MONITOR_TIMEOUT))
+        except OSError as e:
+            if e.errno == errno.ESRCH:
+                # ignores a possible race condition which can occur when the process exits between poll() and kill()
+                pass
+            else:
+                raise
+
+class TestRunner(object):
      def __enter__(self):
-        # Wait for a connection on port 2399 and then run "tail" which will send the file content to that port. Tail
-        # is used because it can start even when the file doesn't exists and remains running after the file has been
-        # processed. This way the idf_monitor can end the communication when it received the content. Using regular
-        # "cat" would invoke exception in idf_monitor.
-        # Note: "-c 1GB" option is used to force sending the whole file under the assumption that all testing files
-        # will be much smaller than 1G.
-        # Note: A temporary file socat_in is used in order to be able to start socat only once instead of for each test.
-        socat_cmd = ['socat',
-                '-U', # unidirectional pipe from file to port
-                'TCP4-LISTEN:2399,reuseaddr,fork',
-                'exec:"tail -c 1GB -F ' + socat_in + '"']
-        print(' '.join(socat_cmd))
-        self._socat_process = subprocess.Popen(socat_cmd, preexec_fn=os.setsid) # See __exit__ for os.setsid
+        self.serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self.serversocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
+        self.serversocket.bind((HOST, 0))
+        self.port = self.serversocket.getsockname()[1]
+        self.serversocket.listen(5)
          return self
  
      def __exit__(self, type, value, traceback):
-        # self._socat_process.terminate() doesn't enough because each connection to the port starts a new socat and a
-        # tail processes
-        os.killpg(os.getpgid(self._socat_process.pid), signal.SIGTERM)
-        # Note: this terminates all the processes but makes the script UNIX-only
+        self.serversocket.shutdown(socket.SHUT_RDWR)
+        self.serversocket.close()
+        print('Socket was closed successfully')
  
-def cleanup():
-    try:
-        os.remove(socat_in)
-    except:
-        # ignore if the file doesn't exist
-        pass
+    def accept_connection(self):
+        """ returns a socket for sending the input for idf_monitor which must be closed before calling this again. """
+        (clientsocket, address) = self.serversocket.accept()
+        # exception will be thrown here if the idf_monitor didn't connect in time
+        clientsocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
+        return clientsocket
  
-def main():
-    start = time.time()
-    cleanup() # avoid sending old content
-    if not os.path.exists(out_dir):
-        os.mkdir(out_dir)
+def test_iteration(runner, test, startup_timeout):
+    print('\nRunning test on {} with filter "{}" and expecting {}'.format(test[0], test[1], test[2]))
      try:
-        with SocatRunner():
-            # Sleep is necessary to make sure that socat is already listening. Only one sleep is used per run (this is
-            # another reason while the temporary socat_in file is used instead of directly reading the test files).
-            time.sleep(1)
-            for t in test_list:
-                print('Running test on {} with filter "{}" and expecting {}'.format(t[0], t[1], t[2]))
-                with open(in_dir + t[0], "r", encoding='utf-8') as i_f, open(socat_in, "w", encoding='utf-8') as s_f:
-                    print('cat {} > {}'.format(i_f.name, s_f.name))
-                    for line in i_f:
-                        s_f.write(line)
-                    idf_exit_sequence = b'\x1d\n'
-                    print('echo "<exit>" >> {}'.format(s_f.name))
-                    s_f.write(idf_exit_sequence.decode())
-                monitor_cmd = [idf_monitor,
-                        '--port', 'socket://localhost:2399',
-                        '--print_filter', t[1],
-                        elf_file]
-                with open(out_dir + t[2], "w", encoding='utf-8') as o_f, open(err_out, "w", encoding='utf-8') as e_f:
-                    try:
-                        (master_fd, slave_fd) = pty.openpty()
-                        print(' '.join(monitor_cmd), end=' ')
-                        print(' > {} 2> {} < {}'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
-                        proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f,
-                                close_fds=True)
-                        proc.wait()
-                    finally:
-                        os.close(slave_fd)
-                        os.close(master_fd)
-                diff_cmd = ['diff', in_dir + t[2], out_dir + t[2]]
-                print(' '.join(diff_cmd))
-                subprocess.check_call(diff_cmd)
-                print('Test has passed')
+        with open(OUT_DIR + test[2], "w", encoding='utf-8') as o_f, open(ERR_OUT, "w", encoding='utf-8') as e_f:
+            monitor_cmd = [sys.executable,
+                    IDF_MONITOR, '--port', 'socket://{}:{}'.format(HOST, runner.port), '--print_filter', test[1], ELF_FILE]
+            (master_fd, slave_fd) = pty.openpty()
+            print('\t', ' '.join(monitor_cmd), sep='')
+            print('\tstdout="{}" stderr="{}" stdin="{}"'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
+            start = time.time()
+            # the server socket is alive so idf_monitor can start now
+            proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f, close_fds=True, bufsize=-1)
+            # - idf_monitor's stdin needs to be connected to some pseudo-tty in docker image even when it is not
+            #   used at all
+            # - setting bufsize is needed because the default value is different on Python 2 and 3
+            # - the default close_fds is also different on Python 2 and 3
+            monitor_watchdog = threading.Timer(MONITOR_TIMEOUT, monitor_timeout, [proc])
+            monitor_watchdog.start()
+            client = runner.accept_connection()
+            # The connection is ready but idf_monitor cannot yet receive data. This seems to happen on Ubuntu 16.04 LTS
+            # and is not related to the version of Python or pyserial. There seems to be no reliable way other than to
+            # do a sleep. The idf_monitor header in ERR_OUT could be checked on Python 2 but the file is not flushed
+            # on Python 3.
+            print('\tSleeping for {:.2f} seconds'.format(startup_timeout))
+            time.sleep(startup_timeout)
+            with open(IN_DIR + test[0], "r", encoding='utf-8') as f:
+                print('\tSending {} to the socket'.format(f.name))
+                for line in f:
+                    client.sendall(line.encode())
+            idf_exit_sequence = b'\x1d\n'
+            print('\tSending <exit> to the socket')
+            client.sendall(idf_exit_sequence)
+            ret = proc.wait()
+            end = time.time()
+            print('\tidf_monitor exited after {:.2f} seconds'.format(end - start))
+            if ret < 0:
+                raise RuntimeError('idf_monitor was terminated by signal {}'.format(-ret))
+            # idf_monitor needs to end before the socket is closed in order to exit without an exception.
      finally:
-        cleanup()
+        if monitor_watchdog:
+            monitor_watchdog.cancel()
+        os.close(slave_fd)
+        os.close(master_fd)
+        if client:
+            client.close()
+        print('\tThe client was closed successfully')
+    f1 = IN_DIR + test[2]
+    f2 = OUT_DIR + test[2]
+    print('\tdiff {} {}'.format(f1, f2))
+    if filecmp.cmp(f1, f2, shallow=False):
+        print('\tTest has passed')
+    else:
+        raise RuntimeError("The contents of the files are different. Please examine the artifacts.")
+
+def main():
+    gstart = time.time()
+    if not os.path.exists(OUT_DIR):
+        os.mkdir(OUT_DIR)
+
+    socket.setdefaulttimeout(SOCKET_TIMEOUT)
+
+    for test in test_list:
+        startup_timeout = 1.0
+        for i in range(RETRIES_PER_TEST):
+            with TestRunner() as runner:
+                # Each test (and each retry) is run with a different port (and server socket). This is done for
+                # the CI run where retry with a different socket is necessary to pass the test. According to the
+                # experiments, retry with the same port (and server socket) is not sufficient.
+                try:
+                    test_iteration(runner, test, startup_timeout)
+                    # no more retries if test_iteration exited without an exception
+                    break
+                except Exception as e:
+                    if i < RETRIES_PER_TEST - 1:
+                        print('Test has failed with exception:', e)
+                        print('Another attempt will be made.')
+                        startup_timeout += 0.5
+                    else:
+                        raise
  
-    end = time.time()
-    print('Execution took {:.2f} seconds'.format(end - start))
+    gend = time.time()
+    print('Execution took {:.2f} seconds\n'.format(gend - gstart))
  
  if __name__ == "__main__":
      main()
author	Roland Dobai <dobai.roland@gmail.com>
	Tue, 6 Nov 2018 10:36:46 +0000 (11:36 +0100)
committer	Roland Dobai <dobai.roland@gmail.com>
	Fri, 23 Nov 2018 12:33:17 +0000 (13:33 +0100)
.gitlab-ci.yml		patch \| blob \| history
tools/idf_monitor.py		patch \| blob \| history
tools/test_idf_monitor/run_test_idf_monitor.py		patch \| blob \| history