From 5275723971332e3fdf88c83ce37ee3592d4905a5 Mon Sep 17 00:00:00 2001 From: Thibault Godouet Date: Sun, 15 Jun 2014 21:26:11 +0100 Subject: [PATCH] Improve support for hardware suspend (to memory or to disk) --- Makefile.in | 5 +- conf.c | 175 ++---------------------------- configure.in | 4 + database.c | 212 ++++++++++++++++++++++++++++++++++++- database.h | 6 ++ doc/en/faq.sgml | 51 ++++++--- doc/en/fcron.8.sgml | 12 +++ doc/en/fcron.conf.5.sgml | 10 +- doc/en/fcrontab.5.sgml | 11 +- doc/en/todo.sgml | 8 +- doc/fcron-doc.mod.in | 3 + fcron.c | 176 +++++++++++++++++++++++++++++- fcronconf.c | 7 ++ fcronconf.h | 1 + files/fcron.conf.in | 3 +- script/.gitignore | 1 + script/boot-install | 51 ++++++++- script/fcron.suspend.sh.in | 53 ++++++++++ 18 files changed, 590 insertions(+), 199 deletions(-) create mode 100644 script/fcron.suspend.sh.in diff --git a/Makefile.in b/Makefile.in index c201904..3d0e902 100644 --- a/Makefile.in +++ b/Makefile.in @@ -30,6 +30,7 @@ PIDDIR = @PIDDIR@ FIFODIR = @FIFODIR@ PIDFILE = @PIDFILE@ REBOOT_LOCK = @REBOOT_LOCK@ +SUSPEND_FILE = @SUSPEND_FILE@ FIFOFILE = @FIFOFILE@ FCRON_SHELL = @FCRON_SHELL@ SENDMAIL = @SENDMAIL@ @@ -120,7 +121,7 @@ exe_list_test: exe_list.o u_list.o exe_list_test.o log.o subs.o %.o: $(SRCDIR)/%.c $(HEADERSALL) $(SRCDIR)/%.h $(CC) $(CFLAGS) -DPIDFILE="\"${PIDFILE}\"" -DREBOOT_LOCK="\"${REBOOT_LOCK}\"" \ - -DFIFOFILE="\"${FIFOFILE}\"" -DETC="\"${ETC}\"" \ + -DSUSPEND_FILE="\"${SUSPEND_FILE}\"" -DFIFOFILE="\"${FIFOFILE}\"" -DETC="\"${ETC}\"" \ -DFCRON_SHELL="\"${FCRON_SHELL}\"" -DFCRON_CONF="\"${FCRON_CONF}\"" \ -DFCRONTABS="\"${FCRONTABS}\"" \ -DFCRON_ALLOW="\"${FCRON_ALLOW}\"" -DFCRON_DENY="\"${FCRON_DENY}\"" \ @@ -130,7 +131,7 @@ exe_list_test: exe_list.o u_list.o exe_list_test.o log.o subs.o initscripts: @(if test ! 
-d script; then mkdir script ; fi ; \ - for F in sysVinit-launcher fcron.sh fcron.init.suse fcron.init.systemd ; do \ + for F in sysVinit-launcher fcron.sh fcron.init.suse fcron.init.systemd fcron.suspend.sh ; do \ $(SRCDIR)/script/gen-in.pl $(SRCDIR)/script/$${F}.in script/$${F} ./ ; \ done) diff --git a/conf.c b/conf.c index 5f808d1..6ea88ea 100644 --- a/conf.c +++ b/conf.c @@ -810,7 +810,7 @@ add_line_to_file(cl_t * cl, cf_t * cf, uid_t runas, char *runas_str, time_t t_save, int is_system_startup) /* check if the line is valid, and if yes, add it to the file cf */ { - time_t slept = now - t_save; + time_t sleep_duration = now - t_save; if (cl->cl_shell == NULL || cl->cl_runas == NULL || cl->cl_mailto == NULL) { error("Line is not valid (empty shell, runas or mailto field)" @@ -839,173 +839,16 @@ add_line_to_file(cl_t * cl, cf_t * cf, uid_t runas, char *runas_str, Set(cl->cl_mailto, cl->cl_file->cf_user); } - /* job has been stopped during execution: insert it in lavg or serial queue - * if it was in one at fcron's stops. */ - /* NOTE: runatreboot is prioritary over jobs that were still running - * when fcron stops, because the former will get run quicker as they are not - * put into the serial queue. runatreboot jobs will be handled later on. 
*/ - if (cl->cl_numexe > 0 && !is_runatreboot(cl->cl_option)) { - - cl->cl_numexe = 0; - if (is_lavg(cl->cl_option)) { - if (!is_strict(cl->cl_option)) - add_lavg_job(cl, -1); - } - else if (is_serial(cl->cl_option) - || is_serial_once(cl->cl_option)) - add_serial_job(cl, -1); - else { - /* job has been stopped during execution : - * launch it again */ - warn("job '%s' did not finish : running it again.", cl->cl_shell); - set_serial_once(cl->cl_option); - add_serial_job(cl, -1); - } - } - - if (is_system_startup || is_volatile(cl->cl_option)) { - clear_hasrun(cl->cl_option); - } - - if (is_runonce(cl->cl_option) && is_hasrun(cl->cl_option)) { - /* if we get here, then is_system_startup is_volatile are both false */ - /* do nothing: don't re-schedule or add to the job queue */ - explain("job '%s' with runonce set has already run since last " - "system startup: not re-scheduling.", cl->cl_shell); + /* make sure the timefreq is valid on @-lines or we could end up with + * infinite loops */ + if (!is_td(cl->cl_option) && cl->cl_timefreq < 10) { + error("Invalid timefreq %ld for job '%s': setting to 1 day", + cl->cl_timefreq, cl->cl_shell); + cl->cl_timefreq = 3600 * 24; } - else if (is_td(cl->cl_option)) { - - /* set the time and date of the next execution */ - if (is_system_startup && is_runatreboot(cl->cl_option)) { - - if (is_notice_notrun(cl->cl_option)) { - - if (cl->cl_runfreq == 1) { - /* %-line */ - set_next_exe_notrun(cl, SYSDOWN_RUNATREBOOT); - } - else { - /* set next exe and mail user */ - time_t since = cl->cl_nextexe; - - cl->cl_nextexe = now; - mail_notrun_time_t(cl, SYSDOWN, since); - } - - } - else { - cl->cl_nextexe = now; - } - - insert_nextexe(cl); - - } - else if (cl->cl_nextexe <= now) { - if (cl->cl_nextexe == 0) - /* the is a line from a new file */ - set_next_exe(cl, NO_GOTO, -1); - else if (cl->cl_runfreq == 1 && is_notice_notrun(cl->cl_option)) - set_next_exe_notrun(cl, SYSDOWN); - else if (is_bootrun(cl->cl_option) && t_save != 0 - && 
cl->cl_runfreq != 1) { - if (cl->cl_remain > 0 && --cl->cl_remain > 0) { - debug(" cl_remain: %d", cl->cl_remain); - } - else { - /* run bootrun jobs */ - cl->cl_remain = cl->cl_runfreq; - debug(" boot-run '%s'", cl->cl_shell); - if (!is_lavg(cl->cl_option)) { - set_serial_once(cl->cl_option); - add_serial_job(cl, -1); - } - else - add_lavg_job(cl, -1); - } - set_next_exe(cl, STD, -1); - } - else { - if (is_notice_notrun(cl->cl_option)) { - /* set next exe and mail user */ - time_t since = cl->cl_nextexe; - - set_next_exe(cl, NO_GOTO, -1); - mail_notrun_time_t(cl, SYSDOWN, since); - } - else - set_next_exe(cl, NO_GOTO, -1); - } - } - else { - /* value of nextexe is valid : just insert line in queue */ - insert_nextexe(cl); - } - } - else { /* is_td(cl->cl_option) */ - if (cl->cl_timefreq < 10) { - error("Invalid timefreq %ld for job '%s': setting to 1 day", - cl->cl_timefreq, cl->cl_shell); - cl->cl_timefreq = 3600 * 24; - } - - /* standard @-lines */ - if (is_system_startup && is_runatreboot(cl->cl_option)) { - cl->cl_nextexe = now; - } - /* t_save == 0 means this is a new file, hence a new line */ - else if (t_save == 0 || is_volatile(cl->cl_option) - || (is_system_startup && (is_rebootreset(cl->cl_option) - || is_runonce(cl->cl_option)))) { - /* cl_first is always saved to disk for a volatile line */ - if (cl->cl_first == LONG_MAX) { - cl->cl_nextexe = TIME_T_MAX; - } - else { - cl->cl_nextexe = now + cl->cl_first; - if (cl->cl_nextexe < now || cl->cl_nextexe > TIME_T_MAX) { - /* there was an integer overflow! */ - error - ("Error while setting next exe time for job '%s': cl_nextexe" - " overflowed (case1). 
now=%lu, cl_timefreq=%lu, cl_nextexe=%lu.", - cl->cl_shell, now, cl->cl_timefreq, cl->cl_nextexe); - error - ("Setting cl_nextexe to TIME_T_MAX to prevent an infinite loop."); - cl->cl_nextexe = TIME_T_MAX; - } - } - } - else { - if (cl->cl_nextexe != LONG_MAX) { - cl->cl_nextexe += slept; - if (cl->cl_nextexe < now || cl->cl_nextexe > TIME_T_MAX) { - /* either there was an integer overflow, or the slept time is incorrect - * (e.g. fcron didn't shut down cleanly and the fcrontab wasn't saved correctly) */ - error - ("Error while setting next exe time for job '%s': cl_nextexe" - " overflowed (case2). now=%lu, cl_timefreq=%lu, cl_nextexe=%lu. " - "Did fcron shut down cleanly?", - cl->cl_shell, now, cl->cl_timefreq, cl->cl_nextexe); - error - ("Setting cl_nextexe to now+cl_timefreq to prevent an infinite loop."); - cl->cl_nextexe = now + cl->cl_timefreq; - error("next execution will now be at %ld.", cl->cl_nextexe); - } - } - } - - insert_nextexe(cl); - } - - if (debug_opt && !(is_runonce(cl->cl_option) && is_hasrun(cl->cl_option))) { - struct tm *ftime; - ftime = localtime(&(cl->cl_nextexe)); - debug(" cmd '%s' next exec %04d-%02d-%02d wday:%d %02d:%02d:%02d" - " (system time)", - cl->cl_shell, (ftime->tm_year + 1900), (ftime->tm_mon + 1), - ftime->tm_mday, ftime->tm_wday, ftime->tm_hour, ftime->tm_min, - ftime->tm_sec); - } + set_next_exe_startup(cl, is_system_startup ? 
CONTEXT_BOOT : CONTEXT_DEFAULT, + sleep_duration); /* add the current line to the list, and allocate a new line */ if ((cl->cl_id = next_id++) >= ULONG_MAX - 1) { diff --git a/configure.in b/configure.in index 590d5b2..bc9fbd2 100644 --- a/configure.in +++ b/configure.in @@ -304,10 +304,14 @@ AC_ARG_WITH(piddir, ) AC_MSG_RESULT([$PIDDIR]) PIDFILE="${PIDDIR}/fcron.pid" +dnl Is it the first time fcron starts since the system rebooted: REBOOT_LOCK="${PIDDIR}/fcron.reboot" +dnl Used to notify fcron the system was suspended, and for how long: +SUSPEND_FILE="${PIDDIR}/fcron.suspend" AC_SUBST(PIDDIR) AC_SUBST(PIDFILE) AC_SUBST(REBOOT_LOCK) +AC_SUBST(SUSPEND_FILE) FIFODIR="${localstatedir}/run" AC_MSG_CHECKING(location of fifo files) diff --git a/database.c b/database.c index c76e629..2bd04fb 100644 --- a/database.c +++ b/database.c @@ -1274,7 +1274,6 @@ set_next_exe(cl_t * line, char option, int info_fd) } - void set_next_exe_notrun(cl_t * line, char context) /* set the time of the next execution and send a mail to tell user his job @@ -1336,6 +1335,212 @@ set_next_exe_notrun(cl_t * line, char context) } +void +reschedule_all_on_resume(const time_t sleep_duration) +/* walk through all files and lines, update the schedule and run as appropriate */ +{ + cf_t *file = NULL; + + for (file = file_base; file; file = file->cf_next) { + cl_t *line = NULL; + + debug("Re-scheduling %s's jobs...", file->cf_user); + + for (line = file->cf_line_base; line; line = line->cl_next) { + set_next_exe_startup(line, CONTEXT_RESUME, sleep_duration); + } + + } +} + +void +set_next_exe_startup(struct cl_t *cl, const int context, + const time_t sleep_duration) + /* Schedule the next execution at startup (or a new file, + * or after a computer suspend/hibernation */ +{ + int is_new_file = (sleep_duration == now) ? 1 : 0; + + /* if job was stopped during execution: insert it in lavg or serial queue + * if it was in one when fcron stopped. 
+ * This only applies to fcron startup and not system resume, as in the latter case + * the job would still be running in the background: in that case we leave it + * to finish normally and we don't run them again. */ + /* NOTE: + * - runatreboot has higher priority than jobs that were still running + * when fcron stopped, because the former will get run quicker as they are not + * put into the serial queue. runatreboot jobs will be handled later on. */ + if (context != CONTEXT_RESUME && cl->cl_numexe > 0 + && !is_runatreboot(cl->cl_option)) { + + cl->cl_numexe = 0; + if (is_lavg(cl->cl_option)) { + if (!is_strict(cl->cl_option)) + add_lavg_job(cl, -1); + } + else if (is_serial(cl->cl_option) + || is_serial_once(cl->cl_option)) + add_serial_job(cl, -1); + else { + /* job has been stopped during execution : + * launch it again */ + warn("job '%s' did not finish : running it again.", cl->cl_shell); + set_serial_once(cl->cl_option); + add_serial_job(cl, -1); + } + } + + if (context == CONTEXT_BOOT + || (context == CONTEXT_DEFAULT && is_volatile(cl->cl_option))) { + clear_hasrun(cl->cl_option); + } + + if (is_runonce(cl->cl_option) && is_hasrun(cl->cl_option)) { + /* if we get here, then context != CONTEXT_BOOT and_volatile is false */ + /* do nothing: don't re-schedule or add to the job queue */ + explain("job '%s' with runonce set has already run since last " + "system startup: not re-scheduling.", cl->cl_shell); + } + else if (is_td(cl->cl_option)) { + + /* set the time and date of the next execution */ + if (context == CONTEXT_BOOT && is_runatreboot(cl->cl_option)) { + + if (is_notice_notrun(cl->cl_option)) { + + if (cl->cl_runfreq == 1) { + /* %-line */ + set_next_exe_notrun(cl, SYSDOWN_RUNATREBOOT); + } + else { + /* set next exe and mail user */ + time_t since = cl->cl_nextexe; + + cl->cl_nextexe = now; + mail_notrun_time_t(cl, SYSDOWN, since); + } + + } + else { + cl->cl_nextexe = now; + } + + insert_nextexe(cl); + + } + else if (cl->cl_nextexe <= now) { + if 
(cl->cl_nextexe == 0) + /* the is a line from a new file */ + set_next_exe(cl, NO_GOTO, -1); + else if (cl->cl_runfreq == 1 && is_notice_notrun(cl->cl_option)) + set_next_exe_notrun(cl, SYSDOWN); + else if (is_bootrun(cl->cl_option) && !is_new_file + && cl->cl_runfreq != 1) { + if (cl->cl_remain > 0 && --cl->cl_remain > 0) { + debug(" cl_remain: %d", cl->cl_remain); + } + else { + /* run bootrun jobs */ + cl->cl_remain = cl->cl_runfreq; + debug(" boot-run '%s'", cl->cl_shell); + if (!is_lavg(cl->cl_option)) { + set_serial_once(cl->cl_option); + add_serial_job(cl, -1); + } + else + add_lavg_job(cl, -1); + } + set_next_exe(cl, STD, -1); + } + else { + if (is_notice_notrun(cl->cl_option)) { + /* set next exe and mail user */ + time_t since = cl->cl_nextexe; + + set_next_exe(cl, NO_GOTO, -1); + mail_notrun_time_t(cl, SYSDOWN, since); + + } + else + set_next_exe(cl, NO_GOTO, -1); + } + } + else { + /* value of nextexe is valid : just insert line in queue unless + * this is a system resume, in which case the line will be there + * already: */ + if (context != CONTEXT_RESUME) { + insert_nextexe(cl); + } + } + } + else { /* is_td(cl->cl_option) */ + if (cl->cl_timefreq < 10) { + error("Invalid timefreq %ld for job '%s': setting to 1 day", + cl->cl_timefreq, cl->cl_shell); + cl->cl_timefreq = 3600 * 24; + } + + /* standard @-lines */ + if (context == CONTEXT_BOOT && is_runatreboot(cl->cl_option)) { + cl->cl_nextexe = now; + } + else if (is_new_file || is_volatile(cl->cl_option) + || (context == CONTEXT_BOOT && (is_rebootreset(cl->cl_option) + || is_runonce(cl->cl_option)))) { + /* cl_first is always saved to disk for a volatile line */ + if (cl->cl_first == LONG_MAX) { + cl->cl_nextexe = TIME_T_MAX; + } + else { + cl->cl_nextexe = now + cl->cl_first; + if (cl->cl_nextexe < now || cl->cl_nextexe > TIME_T_MAX) { + /* there was an integer overflow! */ + error + ("Error while setting next exe time for job '%s': cl_nextexe" + " overflowed (case1). 
now=%lu, cl_timefreq=%lu, cl_nextexe=%lu.", + cl->cl_shell, now, cl->cl_timefreq, cl->cl_nextexe); + error + ("Setting cl_nextexe to TIME_T_MAX to prevent an infinite loop."); + cl->cl_nextexe = TIME_T_MAX; + } + } + } + else { + if (cl->cl_nextexe != LONG_MAX) { + cl->cl_nextexe += sleep_duration; + if (cl->cl_nextexe < now || cl->cl_nextexe > TIME_T_MAX) { + /* either there was an integer overflow, or the sleep_duration time is incorrect + * (e.g. fcron didn't shut down cleanly and the fcrontab wasn't saved correctly) */ + error + ("Error while setting next exe time for job '%s': cl_nextexe" + " overflowed (case2). now=%lu, cl_timefreq=%lu, cl_nextexe=%lu. " + "Did fcron shut down cleanly?", + cl->cl_shell, now, cl->cl_timefreq, cl->cl_nextexe); + error + ("Setting cl_nextexe to now+cl_timefreq to prevent an infinite loop."); + cl->cl_nextexe = now + cl->cl_timefreq; + error("next execution will now be at %ld.", cl->cl_nextexe); + } + } + } + + insert_nextexe(cl); + } + + if (debug_opt && !(is_runonce(cl->cl_option) && is_hasrun(cl->cl_option))) { + struct tm *ftime; + ftime = localtime(&(cl->cl_nextexe)); + debug(" cmd '%s' next exec %04d-%02d-%02d wday:%d %02d:%02d:%02d" + " (system time)", + cl->cl_shell, (ftime->tm_year + 1900), (ftime->tm_mon + 1), + ftime->tm_mday, ftime->tm_wday, ftime->tm_hour, ftime->tm_min, + ftime->tm_sec); + } + +} + + void mail_notrun_time_t(cl_t * line, char context, time_t since_time_t) /* Same as mail_notrun() but with 'since' defined as a time_t instead of a struct tm */ @@ -1403,9 +1608,10 @@ mail_notrun(cl_t * line, char context, struct tm *since) switch (context) { case SYSDOWN: + case SYSDOWN_RUNATREBOOT: fprintf(mailf, "Line '%s' has not run since and including " "%04d-%02d-%02d wday:%d %02d:%02d (timezone=%s)\n" - "due to system's down state.\n", + "due to system's down or suspended state.\n", line->cl_shell, (since->tm_year + 1900), (since->tm_mon + 1), since->tm_mday, since->tm_wday, since->tm_hour, since->tm_min, 
(line->cl_tz) ? line->cl_tz : "system's"); @@ -1441,6 +1647,8 @@ mail_notrun(cl_t * line, char context, struct tm *since) "serialonce and/or fcron's option -m.\n"); fprintf(mailf, "Note that job '%s' has not run.\n", line->cl_shell); break; + default: + error("mail_notrun() called with unkown context '%c'. Ignoring."); } /* become user (for security reasons) */ diff --git a/database.h b/database.h index d3fb5eb..7b9e0f0 100644 --- a/database.h +++ b/database.h @@ -40,6 +40,12 @@ extern void set_next_exe_notrun(struct cl_t *line, char context); #define SYSDOWN 2 /* set_next_exe_notrun() : context */ #define QUEUE_FULL 3 /* set_next_exe_notrun() : context */ #define SYSDOWN_RUNATREBOOT 4 /* set_next_exe_notrun() : context */ +extern void set_next_exe_startup(struct cl_t *cl, const int context, + const time_t sleep_duration); +#define CONTEXT_DEFAULT 1 /* a new file was loaded (and the machine didn't just boot) */ +#define CONTEXT_BOOT 2 /* the machine just booted */ +#define CONTEXT_RESUME 3 /* the machine just resumed from suspend/hibernation */ +extern void reschedule_all_on_resume(const time_t sleep_duration); extern void mail_notrun(struct cl_t *line, char context, struct tm *since); extern void mail_notrun_time_t(cl_t * line, char context, time_t since_time_t); extern job_t *job_queue_remove(cl_t * line); diff --git a/doc/en/faq.sgml b/doc/en/faq.sgml index 0639256..c92313d 100644 --- a/doc/en/faq.sgml +++ b/doc/en/faq.sgml @@ -90,25 +90,42 @@ every day at 2:30, it will run at 3:30 the day of this kind of DST change. - What about fcron and software suspend - (aka. suspend to RAM, to disk)? + How does fcron handle hardware suspend + (e.g. suspend to memory, or disk -- a.k.a. hibernate)? - We suppose here that you are using swsusp and the hibernate - script to do a "suspend to disk", but it should be similar - with other methods. 
- When you switch on your computer after a suspend to disk, - the system time will be incorrect, and will then be corrected - by the hibernate script. Before it is corrected, fcron may - compute the time and date of the next execution of a job: - the computation would then be incorrect (please see the entry - about system clock adjustment in the present FAQ). - So you should have the hibernate script stop fcron before - the suspend, and then restart it when the system is switched on, - ie. put a line like "RestartServices fcron" in your hibernate.conf - file. That way, the system time will always be correct when - fcron runs (assuming that fcron is started after the system time - is corrected). + fcron now fully supports suspend (to memory or disk). + On resume it will adjust the task schedules accordingly, run runatreboot tasks if appropriate, and report non-execution of noticenotrun tasks. + fcron will try to notice suspends by itself without external help, + by checking if it wakes up later than it expected after a sleep. + However this is far from bullet-proof, as fcron may not notice + the computer was suspended or under-estimate the suspend duration. + This is because fcron can be woken up by external events such + as receiving a signal or fcrondyn interaction. There is also a risk + fcron may be woken up before the suspend code has set the computer + clock correctly after resume. If so, it may compute an incorrect + time and date of the execution of a job (please see the entry + about system clock adjustment in the present FAQ). + Because of this, it is recommended to explicitly 'tell' fcron + about suspends and the precise suspend duration: + + + send a SIGSTOP signal to fcron when going into suspend: + $ kill -STOP $(cat &fcron.pid;). + This is to make sure it will reschedule jobs before running any when the computer resumes. + + + Write the suspend duration as a number of seconds (written as a string) in the suspend file, e.g. 
+ $ echo 2342 > &suspendfile; + + + Wake up fcron and tell it to process the suspend file and reschedule tasks accordingly: + $ kill -CONT $(cat &fcron.pid;) + + Alternatively you could have fcron stop on suspend and restart on resume. + However the main drawback would then be that tasks running at the time of suspend + would be run again at resume, even though they may not have been stopped and may finish on resume + (i.e. they would run twice, with a second instance started before the first one finishes). diff --git a/doc/en/fcron.8.sgml b/doc/en/fcron.8.sgml index e07c69b..c2562c4 100644 --- a/doc/en/fcron.8.sgml +++ b/doc/en/fcron.8.sgml @@ -260,6 +260,12 @@ also toggles on/off the printing on debug info on syslog. update (this signal is used by &fcrontab;(5)) + + SIGCONT + + Notify &fcron; that the system was just resumed from suspend (to memory or disk). This will trigger &fcron; to read the &suspendfile; and update the task schedules accordingly. + + @@ -302,6 +308,12 @@ name per line, special name "all" acts for everyone) &fcron;. Take a look at &pam;(8) for more details. + + &suspendfile; + + Location of &fcron; suspend file. This should be used to let &fcron; know how long the system was suspended (to memory or disk), so that task schedules can be updated accordingly. The file must be owned by &rootname;:&rootgroup;, and not writable by others. When the system resumes, write the number of seconds (as a string) the system was suspended into this file, and then send a SIGCONT signal to make fcron process (and then delete) that file. + + diff --git a/doc/en/fcron.conf.5.sgml b/doc/en/fcron.conf.5.sgml index 9a3ec04..590cd38 100644 --- a/doc/en/fcron.conf.5.sgml +++ b/doc/en/fcron.conf.5.sgml @@ -42,19 +42,25 @@ optional. Trailing blanks are also ignored. Valid variables in a fcron.conf file - fcrontabs=directory + fcrontabs=directory (&fcrontabsdir;) &Fcron; spool directory. 
- pidfile=file-path + pidfile=file-path (&fcron.pid;) Location of &fcron; pid file (needed by &fcrontab; to work properly). + + suspendfile=file-path +(&suspendfile;) + + Location of &fcron; suspend file. This should be used to let fcron know how long the system was suspended (to memory or disk), so that task schedules can be updated accordingly. + fifofile=file-path diff --git a/doc/en/fcrontab.5.sgml b/doc/en/fcrontab.5.sgml index d66b830..f2e2d05 100644 --- a/doc/en/fcrontab.5.sgml +++ b/doc/en/fcrontab.5.sgml @@ -82,9 +82,10 @@ you should use the option &optmailto; directly. Entries based on elapsed system up time - The entries of commands which have to be run once every m + Jobs are scheduled to run once every m minutes of &fcron;'s execution (which is normally the same as m minutes of -system's execution) are of the form +system's execution). The time a system is suspended (to memory or disk) is considered +as down time. To configure such a job, use configuration lines of the form: @options frequency command where frequency is a time value of the form value*multiplier+value*multiplier+...+value-in-minutes as "12h02" or "3w2d5h1". @@ -471,7 +472,7 @@ linkend="uptent">lines based on elapsed system up time is recommended ins # run check_laptop_logs.sh after every hour of system up time: @ 60 check_laptop_logs.sh # run check_web_server.sh and check_file_server.sh every night between midnight -# and 3am, one by after the other: +# and 3am, one at a time: %nightly,serial * 0-3 check_web_server.sh %nightly,serial * 0-3 check_file_server.sh # Run compress_home_made_app_log_files.sh once a month, only at night @@ -515,7 +516,7 @@ abbreviation. The options are (default value in parentheses): b boolean(false) - Run an &-line at &fcron;'s startup if it should + Run an &-line at &fcron;'s startup (or system's resume after suspend/hibernation) if it should have run during system down time. @@ -709,7 +710,7 @@ reduce disk access on a laptop. 
boolean(false) Should &fcron; mail user to report the -non-execution of a %-job or an &-job? (because of system down state for both or +non-execution of a %-job or an &-job? (because of system down state (including suspend/hibernation) for both or a too high system load average for the latter) &seealso; options &optlavg;, &optstrict;. diff --git a/doc/en/todo.sgml b/doc/en/todo.sgml index 6c5956f..68c123f 100644 --- a/doc/en/todo.sgml +++ b/doc/en/todo.sgml @@ -27,7 +27,13 @@ A copy of the license is included in gfdl.sgml. Option to compile and install from git sources without generating the doc - register in OS suspend/hibernate mechanism to stop fcron when going to sleep and start it again when resuming from sleep (see FAQ entry). + add systemd suspend hooks for fcron (contribution welcome) + + + add a 'runatresume' option, to run when the computer resumes? (similar to runatreboot) + + + use ask_user() in boot-install diff --git a/doc/fcron-doc.mod.in b/doc/fcron-doc.mod.in index 572e166..f056733 100644 --- a/doc/fcron-doc.mod.in +++ b/doc/fcron-doc.mod.in @@ -27,6 +27,7 @@ + @@ -39,6 +40,8 @@ + + diff --git a/fcron.c b/fcron.c index fb425ed..dce220e 100644 --- a/fcron.c +++ b/fcron.c @@ -44,6 +44,9 @@ RETSIGTYPE sigterm_handler(int x); RETSIGTYPE sigchild_handler(int x); RETSIGTYPE sigusr1_handler(int x); RETSIGTYPE sigusr2_handler(int x); +RETSIGTYPE sigcont_handler(int x); +long int get_suspend_duration(time_t slept_from); +void check_suspend(time_t slept_from, time_t planned_sleep); int parseopt(int argc, char *argv[]); void get_lock(void); int is_system_reboot(void); @@ -85,6 +88,7 @@ gid_t rootgid = 0; char sig_conf = 0; /* is 1 when we got a SIGHUP, 2 for a SIGUSR1 */ char sig_chld = 0; /* is 1 when we got a SIGCHLD */ char sig_debug = 0; /* is 1 when we got a SIGUSR2 */ +char sig_cont = 0; /* is 1 when we got a SIGCONT */ /* jobs database */ struct cf_t *file_base; /* point to the first file of the list */ @@ -520,6 +524,165 @@ sigusr2_handler(int x) 
sig_debug = 1; } +RETSIGTYPE +sigcont_handler(int x) + /* used to notify fcron of a system resume after suspend. + * However this signal could also be received in other cases. */ +{ + sig_cont = 1; +} + +long int +get_suspend_duration(time_t slept_from) + /* Return the amount of time the system was suspended (to mem or disk). + * Return 0 on error. + * + * The idea is that: + * 1) the OS sends the STOP signal to the main fcron process when suspending + * 2) the OS writes the suspend duration (as a string) into suspendfile, + * and then sends the CONT signal to the main fcron process when resuming. + * + * The main reason to do it this way instead of killing fcron and restarting + * it on resume is to better handle jobs that may already be running. + * (e.g. don't run them again when the machine resumes) */ +{ + int fd = -1; + char buf[TERM_LEN]; + int read_len = 0; + long int suspend_duration = 0; /* default value to return on error */ + struct stat s; + + if (sig_cont <= 0) { + /* signal not raised -- do nothing */ + return 0; + } + + /* the signal CONT was raised: reset the signal and check the suspendfile */ + sig_cont = 0; + + fd = open(suspendfile, O_RDONLY | O_NONBLOCK); + if (fd == -1) { + /* If the file doesn't exist, then we assume the user/system + * did a manual 'kill -STOP' / 'kill -CONT' and doesn't intend + * for fcron to account for any suspend time. + * This is not considered as an error. */ + if (errno != ENOENT) { + error_e("Could not open suspend file '%s'", suspendfile); + } + goto cleanup_return; + } + + /* check the file is a 'normal' file (e.g. 
not a link) and only writable + * by root -- don't allow attacker to affect job schedules, + * or delete the suspendfile */ + if (fstat(fd, &s) < 0) { + error_e("could not fstat() suspend file '%s'", suspendfile); + goto cleanup_return; + } + if (!S_ISREG(s.st_mode) || s.st_nlink != 1) { + error_e("suspend file %s is not a regular file", suspendfile); + goto cleanup_return; + } + + if (s.st_mode & S_IWOTH || s.st_uid != rootuid || s.st_gid != rootgid) { + error("suspend file %s must be owned by %s:%s and not writable by" + " others.", suspendfile, ROOTNAME, ROOTGROUP); + goto cleanup_return; + } + + /* read the content of the suspendfile into the buffer */ + read_len = read(fd, buf, sizeof(buf) - 1); + if (read_len < 0) { + /* we have to run this immediately or errno may be changed */ + error_e("Could not read suspend file '%s'", suspendfile); + goto unlink_cleanup_return; + } + if (read_len < 0) { + goto unlink_cleanup_return; + } + buf[read_len] = '\0'; + + errno = 0; + suspend_duration = strtol(buf, NULL, 10); + if (errno != 0) { + error_e("Count not parse suspend duration '%s'", buf); + suspend_duration = 0; + goto unlink_cleanup_return; + } + else if (suspend_duration < 0) { + warn("Read negative suspend_duration (%ld): ignoring."); + suspend_duration = 0; + goto unlink_cleanup_return; + } + else { + debug("Read suspend_duration of '%ld' from suspend file '%s'", + suspend_duration, suspendfile); + + if (now < slept_from + suspend_duration) { + long int time_slept = now - slept_from; + + /* we can have a couple of seconds more due to rounding up, + * but anything more should be an invalid value in suspendfile */ + explain("Suspend duration %lds in suspend file '%s' is longer than " + "we slept. This could be due to rounding. 
" + "Reverting to time slept %lds.", + suspend_duration, suspendfile, time_slept); + suspend_duration = time_slept; + } + } + +unlink_cleanup_return: + if (unlink(suspendfile) < 0) { + warn_e("Could not remove suspend file '%s'", suspendfile); + return 0; + } + +cleanup_return: + if (fd >= 0 && xclose(&fd) < 0) { + warn_e("Could not xclose() suspend file '%s'", suspendfile); + } + +#ifdef HAVE_SIGNAL + signal(SIGCONT, sigcont_handler); + siginterrupt(SIGCONT, 0); +#endif + + return suspend_duration; + +} + +void +check_suspend(time_t slept_from, time_t planned_sleep) + /* Check if the machine was suspended (to mem or disk), and if so + * reschedule jobs accordingly */ +{ + long int suspend_duration; /* amount of time the system was suspended */ + long int actual_sleep; /* time we actually slept */ + + suspend_duration = get_suspend_duration(slept_from); + + /* Also check if there was an unaccounted sleep duration, in case + * the OS is not configured to let fcron properly know about suspends + * via suspendfile. + * This is not perfect as we may miss some suspend time if fcron + * is woken up before the timer expiry, e.g. due to a signal + * or activity on a socket (fcrondyn). + * NOTE: the +5 second is arbitrary -- just a way to make sure + * we don't get any false positive. If the suspend or hibernate + * is very short it seems fine to simply ignore it anyway */ + actual_sleep = now - slept_from; + if (suspend_duration <= 0 && (actual_sleep - planned_sleep) > 5) { + suspend_duration = actual_sleep - planned_sleep; + } + + if (suspend_duration > 0) { + explain("suspend/hibernate detected: we woke up after %lus" + " instead of %lus. 
The system was suspended for %lus.", + actual_sleep, planned_sleep, suspend_duration); + reschedule_all_on_resume(suspend_duration); + } +} + int main(int argc, char **argv) @@ -641,6 +804,7 @@ main(int argc, char **argv) explain("%s[%d] " VERSION_QUOTED " started", prog_name, daemon_pid); #ifdef HAVE_SIGNAL + /* FIXME: check for errors */ signal(SIGTERM, sigterm_handler); signal(SIGHUP, sighup_handler); siginterrupt(SIGHUP, 0); @@ -650,14 +814,18 @@ main(int argc, char **argv) siginterrupt(SIGUSR1, 0); signal(SIGUSR2, sigusr2_handler); siginterrupt(SIGUSR2, 0); + signal(SIGCONT, sigcont_handler); + siginterrupt(SIGCONT, 0); /* we don't want SIGPIPE to kill fcron, and don't need to handle it */ signal(SIGPIPE, SIG_IGN); #elif HAVE_SIGSET + /* FIXME: check for errors */ sigset(SIGTERM, sigterm_handler); sigset(SIGHUP, sighup_handler); sigset(SIGCHLD, sigchild_handler); sigset(SIGUSR1, sigusr1_handler); sigset(SIGUSR2, sigusr2_handler); + sigset(SIGCONT, sigcont_handler); sigset(SIGPIPE, SIG_IGN); #endif @@ -774,6 +942,7 @@ main_loop() time_t save; /* time remaining until next save */ time_t stime; /* time to sleep until next job * execution */ + time_t slept_from; /* time it was when we went into sleep */ #ifdef HAVE_GETTIMEOFDAY struct timeval tv; /* we use usec field to get more precision */ #endif @@ -800,6 +969,9 @@ main_loop() for (;;) { + /* remember when we started to sleep -- this is to detect suspend/hibernate */ + slept_from = time(NULL); + #ifdef HAVE_GETTIMEOFDAY #ifdef FCRONDYN gettimeofday(&tv, NULL); @@ -829,9 +1001,11 @@ main_loop() now = time(NULL); + debug("\n"); + check_signal(); - debug("\n"); + check_suspend(slept_from, stime); test_jobs(); diff --git a/fcronconf.c b/fcronconf.c index 2856a71..2cf6267 100644 --- a/fcronconf.c +++ b/fcronconf.c @@ -37,6 +37,7 @@ extern gid_t rootgid; char *fcronconf = NULL; char *fcrontabs = NULL; char *pidfile = NULL; +char *suspendfile = NULL; char *fifofile = NULL; char *fcronallow = NULL; char *fcrondeny = 
NULL; @@ -53,6 +54,7 @@ init_conf(void) fcronconf = strdup2(ETC "/" FCRON_CONF); fcrontabs = strdup2(FCRONTABS); pidfile = strdup2(PIDFILE); + suspendfile = strdup2(SUSPEND_FILE); fifofile = strdup2(FIFOFILE); fcronallow = strdup2(ETC "/" FCRON_ALLOW); fcrondeny = strdup2(ETC "/" FCRON_DENY); @@ -73,6 +75,7 @@ free_conf(void) Free_safe(fcronconf); Free_safe(fcrontabs); Free_safe(pidfile); + Free_safe(suspendfile); Free_safe(fifofile); Free_safe(fcronallow); Free_safe(fcrondeny); @@ -164,6 +167,9 @@ read_conf(void) else if (strncmp(ptr1, "pidfile", namesize) == 0) { Set(pidfile, ptr2); } + else if (strncmp(ptr1, "suspendfile", namesize) == 0) { + Set(suspendfile, ptr2); + } else if (strncmp(ptr1, "fifofile", namesize) == 0) { Set(fifofile, ptr2); } @@ -193,6 +199,7 @@ read_conf(void) /* debug(" fcrondeny=%s", fcrondeny); */ /* debug(" fcrontabs=%s", fcrontabs); */ /* debug(" pidfile=%s", pidfile); */ +/* debug(" suspendfile=%s", suspendfile); */ /* debug(" fifofile=%s", fifofile); */ /* debug(" editor=%s", editor); */ /* debug(" shell=%s", shell); */ diff --git a/fcronconf.h b/fcronconf.h index 49af7da..a89bdd0 100644 --- a/fcronconf.h +++ b/fcronconf.h @@ -34,6 +34,7 @@ extern char *fcronallow; extern char *fcrondeny; extern char *fcrontabs; extern char *pidfile; +extern char *suspendfile; extern char *fifofile; extern char *editor; extern char *shell; diff --git a/files/fcron.conf.in b/files/fcron.conf.in index 12fb595..7b6c292 100644 --- a/files/fcron.conf.in +++ b/files/fcron.conf.in @@ -7,8 +7,9 @@ # The spool directory where fcron stores its files fcrontabs = @@FCRONTABS@ -# The locations of the pid file and the fifo file +# The locations of the pid file, suspend file and the fifo file pidfile = @@PIDFILE@ +suspendfile = @@SUSPEND_FILE@ fifofile = @@FIFOFILE@ # allow/deny files to determine which users are allowed to use fcrontab diff --git a/script/.gitignore b/script/.gitignore index 68354c2..9ddd62a 100644 --- a/script/.gitignore +++ b/script/.gitignore @@ 
-3,3 +3,4 @@ fcron.sh
 sysVinit-launcher
 fcron.init.suse
 fcron.init.systemd
+fcron.suspend.sh
diff --git a/script/boot-install b/script/boot-install
index ca0c658..37bb34e 100755
--- a/script/boot-install
+++ b/script/boot-install
@@ -7,7 +7,7 @@
 # the DESTSBIN directory
 # the value of DEBUG
 # the value of FCRONTABS
-# the automatic answer
+# the automatic answer: 0 no, 1 yes, 2 ask
 # the src dir
 
 PATH="/sbin:/usr/sbin:/bin:/usr/bin:/usr/X11R6/bin"
@@ -31,9 +31,32 @@ DESTBIN=$2
 DEBUG=$3
 #DEBUG=1
 FCRONTABS=$4
-ANSWER=$5
+ANSWER=$5 # Automatic answer: 0->no, 1->yes, 2->ask
 SRCDIR=$6
 
+# Print exactly one answer, y or n, on stdout.
+# $ANSWER 1 -> y and 0 -> n without prompting; any other value asks the user
+# (prompt on stderr) until y or n is typed. An empty reply defaults to y.
+ask_user() {
+    if test "$ANSWER" -eq 1 ; then
+        echo y
+    elif test "$ANSWER" -eq 0 ; then
+        echo n
+    else
+        # Callers capture stdout, so the prompt must go to stderr
+        ANS=
+        while test \( "$ANS" != "y" \) -a \( "$ANS" != "n" \) ; do
+            printf '%s' "Please answer with 'y' or 'n' (default: 'y'): " >&2
+            read ANS NOTHING
+            test -z "$ANS" && ANS=y # Default value
+        done
+        echo "$ANS"
+    fi
+}
+
+# Validate the command line arguments:
+# FIXME
+
 if test $DEBUG -eq 1; then
     STARTCMD="fcron -b -d"
 else
@@ -264,6 +287,30 @@ echo "Installation process failed to install fcron in your init scripts :"
 echo "please do it manually."
 fi
 
+#
+# Now install the suspend script under the appropriate suspend framework
+#
+echo
+if test -d /usr/lib/systemd/system-sleep ; then
+    # systemd
+    SUSPEND_DEST=/usr/lib/systemd/system-sleep/fcron.sh
+    echo "This system appears to run systemd. Would you like to install the systemd"
+    echo "suspend script under $SUSPEND_DEST?"
+    if test "`ask_user`" = y; then
+        $INSPROG -c -m 754 script/fcron.suspend.sh $SUSPEND_DEST
+    fi
+elif test -d /etc/pm/sleep.d ; then
+    # pm-utils
+    SUSPEND_DEST=/etc/pm/sleep.d/74_fcron
+    echo "This system appears to run pm-utils. Would you like to install the pm-utils"
+    echo "suspend script under $SUSPEND_DEST?"
+    if test "`ask_user`" = y; then
+        $INSPROG -c -m 754 script/fcron.suspend.sh $SUSPEND_DEST
+    fi
+else
+    echo "This script didn't find any suspend system it supports."
+    echo "Please install an appropriate suspend script manually."
+fi
 
 if PID=`pidof fcron`; then
     KILL="kill -TERM $PID"
diff --git a/script/fcron.suspend.sh.in b/script/fcron.suspend.sh.in
new file mode 100644
index 0000000..4d0b55e
--- /dev/null
+++ b/script/fcron.suspend.sh.in
@@ -0,0 +1,62 @@
+#!/bin/sh
+#
+# Suspend script for fcron.
+# Designed to work under systemd and pm-utils.
+#
+# Install as:
+# - systemd: /usr/lib/systemd/system-sleep/fcron.sh
+# - pm-utils: /etc/pm/sleep.d/74_fcron
+#
+# Before suspend: stop fcron and touch SUSPEND_FILE (its modify time records
+# when the system went to sleep).
+# After resume: write the number of seconds slept into SUSPEND_FILE (or remove
+# the file if that cannot be computed), then let fcron continue.
+#
+
+# Expected to be substituted by script/gen-in.pl; keep in sync with pidfile/suspendfile in fcron.conf
+PID_FILE="@@PIDFILE@"
+SUSPEND_FILE="@@SUSPEND_FILE@"
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+LOGGER="logger -p cron.info"
+
+# Read the pid once so that every action and log message refers to the same process
+FCRONPID=`cat "$PID_FILE"`
+
+# pm-utils: first argument will be hibernate|suspend|resume|thaw
+# systemd: two arguments: 1st one is pre|post, second one is suspend|hibernate|hybrid-sleep
+
+case $1 in
+    pre|suspend|suspend_hybrid|hibernate)
+        # We will use the modify time of SUSPEND_FILE to know when we went into suspend:
+        kill -STOP "$FCRONPID" || $LOGGER "$0 $*: could not stop fcron pid '$FCRONPID'"
+        touch "$SUSPEND_FILE" || $LOGGER "$0 $*: could not touch fcron suspend file '$SUSPEND_FILE'"
+        $LOGGER "$0 $*: stopped pid $FCRONPID (from $PID_FILE), and touched $SUSPEND_FILE"
+        ;;
+    post|resume|thaw)
+        SLEEP_FROM=`stat -c %Y "$SUSPEND_FILE"`
+        if test $? -eq 0; then
+            NOW=`date +%s`
+            SLEEP_DURATION=`expr "$NOW" - "$SLEEP_FROM"`
+            # expr exits with 1 when the result is 0: only 2 and above are errors
+            if test $? -lt 2; then
+                $LOGGER "$0 $*: SLEEP_DURATION=$SLEEP_DURATION"
+                echo "$SLEEP_DURATION" > "$SUSPEND_FILE"
+            else
+                # something went wrong -- resume fcron without specifying
+                # a suspend duration as it may be wrong
+                $LOGGER "$0 $*: could not compute sleep duration"
+                rm -f "$SUSPEND_FILE"
+            fi
+        else
+            # something went wrong -- resume fcron without specifying
+            # a suspend duration as it may be wrong
+            $LOGGER "$0 $*: could not stat $SUSPEND_FILE"
+            rm -f "$SUSPEND_FILE"
+        fi
+        $LOGGER "$0 $*: resuming pid $FCRONPID (from $PID_FILE)"
+        kill -CONT "$FCRONPID" || $LOGGER "$0 $*: could not resume fcron pid '$FCRONPID'"
+        ;;
+    *)
+        $LOGGER "$0 $*: invalid argument."
+        ;;
+esac
-- 
2.40.0