pacemaker 2.1.7-2.1.7
Scalable High-Availability cluster resource manager
Loading...
Searching...
No Matches
watchdog.c
Go to the documentation of this file.
1/*
2 * Copyright 2013-2023 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU Lesser General Public License
7 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8 */
9
10#include <crm_internal.h>
11
12#include <sched.h>
13#include <sys/ioctl.h>
14#include <sys/reboot.h>
15
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <unistd.h>
19#include <ctype.h>
20#include <dirent.h>
21#include <signal.h>
22
/* Cached PID of the sbd daemon. pcmk__locate_sbd() fills this in and resets
 * negative lookup results to 0; callers in this file treat values > 1 as
 * "sbd located".
 */
23static pid_t sbd_pid = 0;
24
/*
 * Send a single command character to the kernel via /proc/sysrq-trigger.
 *
 * Compiles to a no-op unless HAVE_LINUX_PROCFS is set. Every failure is
 * logged as a warning and otherwise ignored: the function always returns
 * normally, leaving it to the caller to try something else.
 *
 * t: sysrq command character to write
 */
static void
sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    FILE *procf = NULL;

    // Root can always write here, regardless of kernel.sysrq value
    procf = fopen("/proc/sysrq-trigger", "a");
    if (procf == NULL) {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
        return;
    }

    crm_info("sysrq-trigger: %c", t);
    if (fprintf(procf, "%c\n", t) < 0) {
        crm_perror(LOG_WARNING, "Writing to sysrq-trigger failed");
    }

    /* The stream is buffered, so a rejected write may only be reported when
     * the buffer is flushed on close.
     */
    if (fclose(procf) != 0) {
        crm_perror(LOG_WARNING, "Closing sysrq-trigger failed");
    }
#endif // HAVE_LINUX_PROCFS
}
43
44
49static void
50panic_local(void)
51{
52 int rc = pcmk_ok;
53 uid_t uid = geteuid();
54 pid_t ppid = getppid();
55 const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
56
57 if(uid != 0 && ppid > 1) {
58 /* We're a non-root pacemaker daemon (pacemaker-based,
59 * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
60 * the original pacemakerd parent.
61 *
62 * Of these, only the controller is likely to be initiating resets.
63 */
64 crm_emerg("Signaling parent %lld to panic", (long long) ppid);
66 return;
67
68 } else if (uid != 0) {
69#if HAVE_LINUX_PROCFS
70 /*
71 * No permissions, and no pacemakerd parent to escalate to.
72 * Track down the new pacemakerd process and send a signal instead.
73 */
74 union sigval signal_value;
75
76 memset(&signal_value, 0, sizeof(signal_value));
77 ppid = pcmk__procfs_pid_of("pacemakerd");
78 crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
79
80 if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
81 crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
82 (long long) ppid);
83 }
84#endif // HAVE_LINUX_PROCFS
85
86 /* The best we can do now is die */
88 return;
89 }
90
91 /* We're either pacemakerd, or a pacemaker daemon running as root */
92
93 if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
94 sysrq_trigger('c');
95
96 } else if (pcmk__str_eq(panic_action, "sync-crash", pcmk__str_casei)) {
97 sync();
98 sysrq_trigger('c');
99
100 } else {
101 if (pcmk__str_eq(panic_action, "sync-reboot", pcmk__str_casei)) {
102 sync();
103 }
104 sysrq_trigger('b');
105 }
106 /* reboot(RB_HALT_SYSTEM); rc = errno; */
107 reboot(RB_AUTOBOOT);
108 rc = errno;
109
110 crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
111 (long long) ppid, pcmk_rc_str(rc), rc);
112
113 if(ppid > 1) {
114 /* child daemon */
115 exit(CRM_EX_PANIC);
116 } else {
117 /* pacemakerd or orphan child */
118 exit(CRM_EX_FATAL);
119 }
120}
121
126static void
127panic_sbd(void)
128{
129 union sigval signal_value;
130 pid_t ppid = getppid();
131
132 crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
133
134 memset(&signal_value, 0, sizeof(signal_value));
135 /* TODO: Arrange for a slightly less brutal option? */
136 if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
137 crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
138 (long long) sbd_pid);
139 panic_local();
140 }
141
142 if(ppid > 1) {
143 /* child daemon */
144 exit(CRM_EX_PANIC);
145 } else {
146 /* pacemakerd or orphan child */
147 exit(CRM_EX_FATAL);
148 }
149}
150
160void
161pcmk__panic(const char *origin)
162{
163 /* Ensure sbd_pid is set */
164 (void) pcmk__locate_sbd();
165
167 {
168 // getppid() == 1 means our original parent no longer exists
169 crm_emerg("Shutting down instead of panicking the node "
170 CRM_XS " origin=%s sbd=%lld parent=%d",
171 origin, (long long) sbd_pid, getppid());
173 return;
174 },
175 {}
176 );
177
178 if(sbd_pid > 1) {
179 crm_emerg("Signaling sbd[%lld] to panic the system: %s",
180 (long long) sbd_pid, origin);
181 panic_sbd();
182
183 } else {
184 crm_emerg("Panicking the system directly: %s", origin);
185 panic_local();
186 }
187}
188
193pid_t
195{
196 char *pidfile = NULL;
197 char *sbd_path = NULL;
198 int rc;
199
200 if(sbd_pid > 1) {
201 return sbd_pid;
202 }
203
204 /* Look for the pid file */
205 pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
206 sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
207
208 /* Read the pid file */
209 rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
210 if (rc == pcmk_rc_ok) {
211 crm_trace("SBD detected at pid %lld (via PID file %s)",
212 (long long) sbd_pid, pidfile);
213
214#if HAVE_LINUX_PROCFS
215 } else {
216 /* Fall back to /proc for systems that support it */
217 sbd_pid = pcmk__procfs_pid_of("sbd");
218 crm_trace("SBD detected at pid %lld (via procfs)",
219 (long long) sbd_pid);
220#endif // HAVE_LINUX_PROCFS
221 }
222
223 if(sbd_pid < 0) {
224 sbd_pid = 0;
225 crm_trace("SBD not detected");
226 }
227
228 free(pidfile);
229 free(sbd_path);
230
231 return sbd_pid;
232}
233
/*
 * Return the SBD_WATCHDOG_TIMEOUT environment variable parsed by
 * crm_get_msec() (milliseconds).
 *
 * The environment is read on the first call and the result is cached; -2
 * marks "not read yet". What is returned for an unset or unparsable variable
 * is whatever crm_get_msec() yields for that input.
 */
long
pcmk__get_sbd_timeout(void)
{
    static long sbd_timeout = -2;

    if (sbd_timeout == -2) {
        // crm_get_msec() returns long long; the narrowing here is deliberate
        sbd_timeout = (long) crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    }
    return sbd_timeout;
}
244
245bool
247{
248 static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
249 static bool checked_sync_resource_startup = false;
250
251 if (!checked_sync_resource_startup) {
252 const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
253
254 if (sync_env == NULL) {
255 crm_trace("Defaulting to %sstart-up synchronization with sbd",
256 (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
257
258 } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
259 crm_warn("Defaulting to %sstart-up synchronization with sbd "
260 "because environment value '%s' is invalid",
261 (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
262 }
263 checked_sync_resource_startup = true;
264 }
265 return sync_resource_startup != 0;
266}
267
/*
 * Calculate a stonith-watchdog-timeout value from the SBD timeout.
 *
 * Returns twice the value of pcmk__get_sbd_timeout(), or 0 when that value is
 * zero or negative.
 */
long
pcmk__auto_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}
275
276bool
277pcmk__valid_sbd_timeout(const char *value)
278{
279 long st_timeout = value? crm_get_msec(value) : 0;
280
281 if (st_timeout < 0) {
282 st_timeout = pcmk__auto_watchdog_timeout();
283 crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
284 st_timeout, value);
285 }
286
287 if (st_timeout == 0) {
288 crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
289 value? value : "default");
290
291 } else if (pcmk__locate_sbd() == 0) {
292 crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
293 "but SBD not active", (value? value : "auto"));
295 return false;
296
297 } else {
298 long sbd_timeout = pcmk__get_sbd_timeout();
299
300 if (st_timeout < sbd_timeout) {
301 crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
302 "(must be >%ldms)", value, sbd_timeout);
304 return false;
305 }
306 crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
307 value, sbd_timeout);
308 }
309 return true;
310}
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition pid.c:172
pid_t pcmk__procfs_pid_of(const char *name)
Definition procfs.c:111
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition strings.c:364
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1, 2)
int crm_str_to_boolean(const char *s, int *ret)
Definition strings.c:424
#define SBIN_DIR
Definition config.h:574
#define PCMK__SBD_SYNC_DEFAULT
Definition config.h:568
#define PCMK_RUN_DIR
Definition config.h:541
#define crm_info(fmt, args...)
Definition logging.h:382
#define crm_warn(fmt, args...)
Definition logging.h:380
#define CRM_XS
Definition logging.h:56
#define crm_perror(level, fmt, args...)
Send a system error message to both the log and stderr.
Definition logging.h:323
#define crm_debug(fmt, args...)
Definition logging.h:384
#define crm_trace(fmt, args...)
Definition logging.h:385
#define crm_emerg(fmt, args...)
Definition logging.h:377
#define pcmk__if_tracing(if_action, else_action)
#define PCMK__ENV_PANIC_ACTION
const char * pcmk__env_option(const char *option)
Definition options.c:58
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition results.c:501
@ CRM_EX_PANIC
Panic the local host.
Definition results.h:272
@ CRM_EX_FATAL
Do not respawn.
Definition results.h:271
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition results.c:936
@ pcmk_rc_ok
Definition results.h:154
#define pcmk_ok
Definition results.h:68
@ pcmk__str_casei
pid_t pcmk__locate_sbd(void)
Definition watchdog.c:194
long pcmk__get_sbd_timeout(void)
Definition watchdog.c:235
bool pcmk__get_sbd_sync_resource_startup(void)
Definition watchdog.c:246
bool pcmk__valid_sbd_timeout(const char *value)
Definition watchdog.c:277
long pcmk__auto_watchdog_timeout(void)
Definition watchdog.c:269
void pcmk__panic(const char *origin)
Definition watchdog.c:161