commit 261c549: [Project] Allow to kill workers that hang up

Vsevolod Stakhov vsevolod at highsecure.ru
Sat Sep 21 16:21:06 UTC 2019


Author: Vsevolod Stakhov
Date: 2019-09-21 16:27:25 +0100
URL: https://github.com/rspamd/rspamd/commit/261c54963d3e48834100180125a2a17e1759cb61 (HEAD -> master)

[Project] Allow to kill workers that hang up

---
 src/libserver/cfg_file.h    |  1 +
 src/libserver/cfg_rcl.c     |  7 +++++++
 src/libserver/worker_util.c | 25 +++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 7186a73ec..d42fbfba9 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -380,6 +380,7 @@ struct rspamd_config {
 	gsize images_cache_size;                        /**< size of LRU cache for DCT data from images			*/
 	gdouble task_timeout;                           /**< maximum message processing time					*/
 	gint default_max_shots;                         /**< default maximum count of symbols hits permitted (-1 for unlimited) */
+	gint32 heartbeats_loss_max;                     /**< number of heartbeats lost to consider worker's termination */
 	gdouble heartbeat_interval;                     /**< interval for heartbeats for workers				*/
 
 	enum rspamd_log_type log_type;                  /**< log type											*/
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index 5a1d3a639..11c378d5d 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -2188,6 +2188,13 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
 				G_STRUCT_OFFSET (struct rspamd_config, heartbeat_interval),
 				RSPAMD_CL_FLAG_TIME_FLOAT,
 				"Time between workers heartbeats");
+		rspamd_rcl_add_default_handler (sub,
+				"heartbeats_loss_max",
+				rspamd_rcl_parse_struct_integer,
+				G_STRUCT_OFFSET (struct rspamd_config, heartbeats_loss_max), /* gint32 field, not the gdouble interval */
+				RSPAMD_CL_FLAG_INT_32,
+				"Maximum count of heartbeats to be lost before trying to "
+				"terminate a worker (default: 0 - disabled)");
 
 		/* Neighbours configuration */
 		rspamd_rcl_add_section_doc (&sub->subsections, "neighbours", "name",
diff --git a/src/libserver/worker_util.c b/src/libserver/worker_util.c
index d2e52d5a1..883e7e8a9 100644
--- a/src/libserver/worker_util.c
+++ b/src/libserver/worker_util.c
@@ -756,6 +756,31 @@ rspamd_main_heartbeat_cb (EV_P_ ev_timer *w, int revents)
 					g_quark_to_string (wrk->type),
 					wrk->pid,
 					timebuf);
+
+			if (rspamd_main->cfg->heartbeats_loss_max > 0 &&
+				-(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max) {
+				/* Escalate: SIGTERM when the loss limit is first reached, SIGKILL once */
+				/* the count grows past it (assumes nbeats keeps decreasing per lost beat) */
+				if (-(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max + 1) {
+					msg_err_main ("force kill worker type %s with pid %P, "
+								  "last beat on: %s; %L heartbeats lost",
+							g_quark_to_string (wrk->type),
+							wrk->pid,
+							timebuf,
+							-(wrk->hb.nbeats));
+					kill (wrk->pid, SIGKILL);
+				}
+				else {
+					msg_err_main ("terminate worker type %s with pid %P, "
+								  "last beat on: %s; %L heartbeats lost",
+							g_quark_to_string (wrk->type),
+							wrk->pid,
+							timebuf,
+							-(wrk->hb.nbeats));
+					kill (wrk->pid, SIGTERM);
+				}
+
+			}
 		}
 	}
 	else if (wrk->hb.nbeats < 0) {


More information about the Commits mailing list