Browse code

[tika] BM-14997 BM-13708 Feat: use systemd watchdog to auto-restart a crashed tika

Thomas Cataldo authored on 27/06/2019 11:01:19
Showing 4 changed files
... ...
@@ -15,6 +15,7 @@ RemainAfterExit=no
15 15
 SuccessExitStatus=129 143
16 16
 Environment=JAVA_HOME=/usr/lib/jvm/bm-jdk
17 17
 ExecStart=/usr/share/bm-tika/bm-tika /var/run/bm-tika.pid
18
+Restart=on-watchdog
18 19
 
19 20
 [Install]
20 21
 WantedBy=multi-user.target
... ...
@@ -1,6 +1,7 @@
1 1
 package net.bluemind.systemd.notify;
2 2
 
3 3
 import java.io.File;
4
+import java.util.concurrent.TimeUnit;
4 5
 
5 6
 import org.slf4j.Logger;
6 7
 import org.slf4j.LoggerFactory;
... ...
@@ -54,6 +55,28 @@ public class SystemD {
54 55
 			}
55 56
 		}
56 57
 
58
+		public void setupWatchdog(long period, TimeUnit unit) {
59
+			int pid = LIBC.getpid();
60
+			logger.info("Setup systemd watchdog for PID {}...", pid);
61
+			int errorCode = impl.sd_pid_notify(pid, 0, "WATCHDOG_USEC=" + unit.toMicros(period));
62
+			if (errorCode <= 0) {
63
+				logger.error("setupWatchdog failed: {}", errorCode);
64
+			} else {
65
+				logger.info("setupWatchdog OK for {}, errorCode: {}", pid, errorCode);
66
+			}
67
+		}
68
+
69
+		public void watchdogKeepalive() {
70
+			int pid = LIBC.getpid();
71
+			logger.info("keepAlive for PID {}...", pid);
72
+			int errorCode = impl.sd_pid_notify(pid, 0, "WATCHDOG=1");
73
+			if (errorCode <= 0) {
74
+				logger.error("keepAlive failed: {}", errorCode);
75
+			} else {
76
+				logger.debug("keepAlive for {}, errorCode: {}", pid, errorCode);
77
+			}
78
+		}
79
+
57 80
 	}
58 81
 
59 82
 	private interface RawApi extends Library {
... ...
@@ -18,6 +18,7 @@ import net.bluemind.lib.vertx.VertxPlatform;
18 18
 import net.bluemind.systemd.notify.SystemD;
19 19
 import net.bluemind.tika.server.impl.ExtractTextWorker;
20 20
 import net.bluemind.tika.server.impl.ReceiveDocumentVerticle;
21
+import net.bluemind.tika.server.impl.SystemdWatchdogVerticle;
21 22
 
22 23
 public class TikaServer implements IApplication {
23 24
 
... ...
@@ -62,13 +63,19 @@ public class TikaServer implements IApplication {
62 63
 			}
63 64
 		};
64 65
 
65
-		pm.deployVerticle(ReceiveDocumentVerticle.class.getCanonicalName(), null, new URL[0], 4, null, doneHandler);
66
+		pm.deployVerticle(ReceiveDocumentVerticle.class.getCanonicalName(), null, new URL[0], 32, null, doneHandler);
66 67
 
67 68
 		pm.deployWorkerVerticle(false, ExtractTextWorker.class.getCanonicalName(), null, new URL[0], 4, null,
68 69
 				doneHandler);
69 70
 		cdl.await(1, TimeUnit.MINUTES);
70 71
 		if (SystemD.isAvailable()) {
71 72
 			SystemD.get().notifyReady();
73
+			pm.deployVerticle(SystemdWatchdogVerticle.class.getCanonicalName(), null, new URL[0], 1, null, ar -> {
74
+				if (ar.failed()) {
75
+					logger.error("Watchdog setup failed", ar.cause());
76
+				}
77
+			});
78
+
72 79
 		}
73 80
 
74 81
 		return IApplication.EXIT_OK;
75 82
new file mode 100644
... ...
@@ -0,0 +1,35 @@
1
+package net.bluemind.tika.server.impl;
2
+
3
+import java.io.File;
4
+import java.util.concurrent.TimeUnit;
5
+
6
+import org.slf4j.Logger;
7
+import org.slf4j.LoggerFactory;
8
+import org.vertx.java.platform.Verticle;
9
+
10
+import net.bluemind.systemd.notify.SystemD;
11
+
12
+public final class SystemdWatchdogVerticle extends Verticle {
13
+
14
+	private static final Logger logger = LoggerFactory.getLogger(SystemdWatchdogVerticle.class);
15
+
16
+	public SystemdWatchdogVerticle() {
17
+		logger.info("created");
18
+	}
19
+
20
+	public void start() {
21
+		SystemD.get().setupWatchdog(20, TimeUnit.SECONDS);
22
+		vertx.setPeriodic(10, tid -> {
23
+			// to check our watchdog works as expected
24
+			File f = new File("/root/tika.trigger");
25
+			if (f.exists()) {
26
+				vertx.cancelTimer(tid);
27
+				f.delete();
28
+			} else {
29
+				SystemD.get().watchdogKeepalive();
30
+			}
31
+		});
32
+		logger.info("Watchdog started.");
33
+	}
34
+
35
+}