[28828] in Source-Commits

home help back first fref pref prev next nref lref last post

moira commit [debian]: Remove delay after incr-runner spawns a child process, by calling waitpid() inside do_service() instead of polling a global variable set by a SIGCHLD handler after an interval. reapchild() still exists to reap child processes which timed out.

daemon@ATHENA.MIT.EDU (Anders Kaseorg)
Wed Apr 25 23:50:00 2018

Date: Wed, 25 Apr 2018 23:49:53 -0400
From: Anders Kaseorg <andersk@mit.edu>
Message-Id: <201804260349.w3Q3nr3o024782@drugstore.mit.edu>
To: source-commits@mit.edu

https://github.com/mit-athena/moira/commit/81678daba15f3c4efeaec01011762029c98a8826
commit 81678daba15f3c4efeaec01011762029c98a8826
Author: Greg Hudson <ghudson@mit.edu>
Date:   Mon Oct 17 11:23:21 2016 -0400

    Remove delay after incr-runner spawns a child process, by calling
    waitpid() inside do_service() instead of polling a global variable set
    by a SIGCHLD handler after an interval.  reapchild() still exists to
    reap child processes which timed out.

 moira/incremental/incr-runner/incr-runner.pc |   81 ++++++++++++--------------
 1 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/moira/incremental/incr-runner/incr-runner.pc b/moira/incremental/incr-runner/incr-runner.pc
index 6022bbe..6a48a28 100644
--- a/moira/incremental/incr-runner/incr-runner.pc
+++ b/moira/incremental/incr-runner/incr-runner.pc
@@ -29,12 +29,6 @@
 #define MAXARGC 20
 #define INC_TIMEOUT (3 * 60)        /* 3 minutes */
 
-int inc_pid = 0;
-int inc_running = 0;
-time_t inc_started;
-int child_exited_abnormally = 0;
-int child_pid, child_signal, child_status;
-
 EXEC SQL INCLUDE sqlca;
 void sqlglm(char *, unsigned int *, unsigned int *);
 void dbmserr(void);
@@ -42,6 +36,7 @@ void *xmalloc(size_t);
 void do_service(char *name);
 void free_argv(char **argv, int argc);
 void reapchild(int x);
+void nothing(int x);
 
 RCSID("$HeadURL$ $Id$");
 
@@ -153,7 +148,7 @@ void do_service(char *name)
   char *argv[MAXARGC * 2 + 4], prog[MAXPATHLEN], cbefore[3], cafter[3];
   int i, length = 0;
   EXEC SQL BEGIN DECLARE SECTION;
-  int beforec, afterc, incremental_id;
+  int beforec, afterc, incremental_id, status;
   char table[INCREMENTAL_QUEUE_TABLE_NAME_SIZE];
   char encoded_before[INCREMENTAL_QUEUE_BEFORE_SIZE];
   char encoded_after[INCREMENTAL_QUEUE_AFTER_SIZE];
@@ -161,14 +156,23 @@ void do_service(char *name)
   void *flattened_before = NULL, *flattened_after = NULL;
   char **before_argv = NULL, **after_argv = NULL;
   sigset_t sigs;
-  time_t now;
   struct sigaction action;
+  pid_t inc_pid, pid;
 
   EXEC SQL CONNECT :db IDENTIFIED BY :db;
 
   action.sa_flags = 0;
   sigemptyset(&action.sa_mask);
-  
+
+  /* Allow SIGALRM to interrupt waitpid(), but do nothing when it fires. */
+  action.sa_handler = nothing;
+  if (sigaction(SIGALRM, &action, NULL) < 0)
+    {
+      com_err(whoami, errno, "Unable to establish signal handlers.");
+      exit(1);
+    }
+
+  /* Set up a handler to reap timed-out incremental processes. */
   action.sa_handler = reapchild;
   sigaddset(&action.sa_mask, SIGCHLD);
   if (sigaction(SIGCHLD, &action, NULL) < 0)
@@ -186,21 +190,6 @@ void do_service(char *name)
 	  exit(1);
 	}
 
-      if (child_exited_abnormally)
-        {
-          critical_alert(whoami, "incr-runner", "%d: child exits with signal %d status %d",
-                         child_pid, child_signal, child_status);
-          child_exited_abnormally = 0;
-        }
-
-      time(&now);
-
-      if (inc_running && now - inc_started < INC_TIMEOUT)
-	goto sleep;
-
-      if (inc_running)
-	critical_alert(whoami, "incr-runner", "incremental timeout on pid %d", inc_pid);
-
       EXEC SQL SELECT table_name, beforec, afterc, before, after, incremental_id INTO
 	:table, :beforec, :afterc, :encoded_before, :encoded_after, :incremental_id
 	FROM incremental_queue WHERE service = :name AND
@@ -208,7 +197,10 @@ void do_service(char *name)
 
       /* No matching rows */
       if (sqlca.sqlerrd[2] == 0)
-	goto sleep;
+	{
+	  sleep(1);
+	  continue;
+	}
       
       sprintf(prog, "%s/%s.incr", BIN_DIR, name);
       argv[0] = prog;
@@ -270,8 +262,20 @@ void do_service(char *name)
 	  critical_alert(whoami, "incr-runner", "Failed to start incremental update %s", prog);
 	  break;
 	default:
-	  inc_running = 1;
-	  inc_started = now;
+	  break;
+	}
+
+      alarm(INC_TIMEOUT);
+      pid = waitpid(inc_pid, &status, 0);
+      alarm(0);
+      if (pid == -1 && errno == EINTR)
+	critical_alert(whoami, "incr-runner", "incremental timeout on pid %d", (int) inc_pid);
+      else if (pid == -1)
+	critical_alert(whoami, "incr-runner", "error in waitpid");
+      else if (WTERMSIG(status) != 0 || WEXITSTATUS(status) != 0)
+	{
+          critical_alert(whoami, "incr-runner", "%d: child exits with signal %d status %d",
+                         (int) inc_pid, WTERMSIG(status), WEXITSTATUS(status));
 	}
 
       sigprocmask(SIG_UNBLOCK, &sigs, NULL);
@@ -296,9 +300,6 @@ void do_service(char *name)
 	  free_argv(after_argv, afterc);
 	  after_argv = NULL;
 	}
-
-    sleep:
-      usleep(250000);
     }
 
   EXEC SQL COMMIT RELEASE;
@@ -339,18 +340,12 @@ void free_argv(char **argv, int argc)
 
 void reapchild(int x)
 {
-  int status, pid;
+  int status;
 
-  while ((pid = waitpid(-1, &status, WNOHANG)) > 0)
-    {
-      if (pid == inc_pid)
-        inc_running = 0;
-      if (WTERMSIG(status) != 0 || WEXITSTATUS(status) != 0)
-        {
-          child_exited_abnormally = 1;
-          child_pid = pid;
-          child_signal = WTERMSIG(status);
-          child_status = WEXITSTATUS(status);
-        }
-    }
+  while (waitpid(-1, &status, WNOHANG) > 0)
+    ;
+}
+
+void nothing(int x)
+{
 }

home help back first fref pref prev next nref lref last post