diff options
author | Patrick Ohly <patrick.ohly@intel.com> | 2013-03-06 01:59:11 -0800 |
---|---|---|
committer | Patrick Ohly <patrick.ohly@intel.com> | 2013-03-06 02:07:07 -0800 |
commit | ef4f77e06aee18a9965c4e9e13100ca2783a94bd (patch) | |
tree | 37e2f49d862b76e663bf2428c1e8bce73d094ad8 | |
parent | 762d6c09107bb95f1941458e6f43e49efed1158f (diff) |
D-Bus testing: kill partially terminated processes
When running syncevo-dbus-server under valgrindcheck.sh,
the following happened occasionally:
- syncevo-dbus-server main thread quits, some threads keep running
=> ps shows the process as <defunct> with ppid = 1 = init
- valgrindcheck.sh notices that the process is done,
reports status and quits
- test-dbus.py fails to wait for the syncevo-dbus-server
process (because it is not the parent) and assumes that
the process is gone
At this point there is a lingering process which occupies the
well-known D-Bus name (= all further tests fail) and which
prevents unmounting the chroot.
It's unknown how the syncevo-dbus-server gets into that state.
Could be valgrind 3.7.0 or the kernel 3.4.28-2.20-xen.
As a workaround, let test-dbus.py collect the pids of all processes that it
couldn't wait for and keep sending them SIGKILL until the kill attempt
reports "not found".
-rwxr-xr-x | test/test-dbus.py | 36 |
1 files changed, 35 insertions, 1 deletions
diff --git a/test/test-dbus.py b/test/test-dbus.py index df22112b..c8aecd11 100755 --- a/test/test-dbus.py +++ b/test/test-dbus.py @@ -786,10 +786,27 @@ Use check=lambda: (expr1, expr2, ...) when more than one check is needed. logging.printf("found children: %s", children) return children + def killPending(self, pending): + '''Ensure that all processes listed with their pid are not running.''' + while True: + again = [] + for pid in pending: + logging.printf("sending SIGKILL to pending proccess %d" % pid) + if not TryKill(pid, signal.SIGKILL): + logging.printf("pending process %d is gone" % pid) + else: + again.append(pid) + if again: + time.sleep(0.1) + pending = again + else: + break + def killChildren(self, delay): '''Find all children of the current process and kill them. First send SIGTERM, then after a grace period SIGKILL.''' children = self.getChildren() + pending = [] # First pass with SIGTERM? if delay: for pid, (name, cmdline) in children.iteritems(): @@ -819,9 +836,24 @@ Use check=lambda: (expr1, expr2, ...) when more than one check is needed. del children[pid] except OSError, ex: if ex.errno == errno.ECHILD: - # someone else must have been faster, also okay + # Process might have transferred to init as parent: + # + # PID PPID TID NLWP CMD COMMAND + # 20617 1 - 2 [memcheck-amd64-] <defunct> [memcheck-amd64-] <defunct> + # - - 20617 - - - + # - - 21012 - - - + # + # This was observed for syncevo-dbus-server when running under + # valgrind despite apparently having terminated normally + # (valgrindcheck.sh sees the exist status, log output confirms + # normal shutdown). Not sure why process shutdown did not + # terminate all threads. + # + # Remember to kill the remaining threads with SIGKILL, + # as a workaround. logging.printf("process %d %s gone at %s", pid, name, time.asctime()) + pending.append(pid) del children[pid] else: raise ex @@ -829,6 +861,7 @@ Use check=lambda: (expr1, expr2, ...) when more than one check is needed. 
# All children quit normally. logging.printf("all process gone at %s", time.asctime()) + self.killPending(pending) return [] time.sleep(0.1) # Force killing of remaining children. It's still possible @@ -841,6 +874,7 @@ Use check=lambda: (expr1, expr2, ...) when more than one check is needed. if TryKill(pid, signal.SIGKILL): logging.printf("killed %d %s", pid, name) killed.append("%d %s" % (pid, name)) + self.killPending(pending) return killed def serverExecutableHelper(self, pid): |