diff --git a/contrib/start-scripts/linux b/contrib/start-scripts/linux
index b950cf512c3..bab8b0efc67 100644
--- a/contrib/start-scripts/linux
+++ b/contrib/start-scripts/linux
@@ -43,14 +43,17 @@ PGLOG="$PGDATA/serverlog"
 # It's often a good idea to protect the postmaster from being killed by the
 # OOM killer (which will tend to preferentially kill the postmaster because
 # of the way it accounts for shared memory). Setting the OOM_SCORE_ADJ value
-# to -1000 will disable OOM kill altogether. If you enable this, you probably
-# want to compile PostgreSQL with "-DLINUX_OOM_SCORE_ADJ=0", so that
-# individual backends can still be killed by the OOM killer.
+# to -1000 will disable OOM kill altogether, which is a good thing for the
+# postmaster, but not so much for individual backends. If you enable this,
+# also uncomment the DAEMON_ENV line, which will instruct backends to set
+# their OOM adjustments back to the default setting of zero.
 #OOM_SCORE_ADJ=-1000
+#DAEMON_ENV="PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj"
 
 # Older Linux kernels may not have /proc/self/oom_score_adj, but instead
 # /proc/self/oom_adj, which works similarly except the disable value is -17.
-# For such a system, enable this and compile with "-DLINUX_OOM_ADJ=0".
+# For such a system, uncomment these two lines instead.
 #OOM_ADJ=-17
+#DAEMON_ENV="PG_OOM_ADJUST_FILE=/proc/self/oom_adj"
 
 ## STOP EDITING HERE
@@ -84,7 +87,7 @@ case $1 in
     echo -n "Starting PostgreSQL: "
     test x"$OOM_SCORE_ADJ" != x && echo "$OOM_SCORE_ADJ" > /proc/self/oom_score_adj
     test x"$OOM_ADJ" != x && echo "$OOM_ADJ" > /proc/self/oom_adj
-    su - $PGUSER -c "$DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1
+    su - $PGUSER -c "$DAEMON_ENV $DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1
     echo "ok"
     ;;
   stop)
@@ -97,7 +100,7 @@ case $1 in
     su - $PGUSER -c "$PGCTL stop -D '$PGDATA' -s -m fast -w"
     test x"$OOM_SCORE_ADJ" != x && echo "$OOM_SCORE_ADJ" > /proc/self/oom_score_adj
     test x"$OOM_ADJ" != x && echo "$OOM_ADJ" > /proc/self/oom_adj
-    su - $PGUSER -c "$DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1
+    su - $PGUSER -c "$DAEMON_ENV $DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1
     echo "ok"
     ;;
   reload)
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index 9fadef5c9da..a2081c4b13a 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1275,7 +1275,7 @@ sysctl -w vm.overcommit_memory=2
     Another approach, which can be used with or without altering
     vm.overcommit_memory, is to set the process-specific
-    oom_score_adj value for the postmaster process to
+    OOM score adjustment value for the postmaster process to
     -1000, thereby guaranteeing it will not be targeted by the OOM
     killer. The simplest way to do this is to execute
@@ -1284,20 +1284,28 @@ echo -1000 > /proc/self/oom_score_adj
     in the postmaster's startup script just before invoking the postmaster.
     Note that this action must be done as root, or it will have no effect;
     so a root-owned startup script is the easiest place to do it. If you
-    do this, you may also wish to build PostgreSQL
-    with -DLINUX_OOM_SCORE_ADJ=0 added to CPPFLAGS.
-    That will cause postmaster child processes to run with the normal
-    oom_score_adj value of zero, so that the OOM killer can still
-    target them at need.
+    do this, you should also set these environment variables in the startup
+    script before invoking the postmaster:
+
+export PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj
+export PG_OOM_ADJUST_VALUE=0
+
+    These settings will cause postmaster child processes to run with the
+    normal OOM score adjustment of zero, so that the OOM killer can still
+    target them at need. You could use some other value for
+    PG_OOM_ADJUST_VALUE if you want the child processes to run
+    with some other OOM score adjustment. (PG_OOM_ADJUST_VALUE
+    can also be omitted, in which case it defaults to zero.) If you do not
+    set PG_OOM_ADJUST_FILE, the child processes will run with the
+    same OOM score adjustment as the postmaster, which is unwise since the
+    whole point is to ensure that the postmaster has a preferential setting.
     Older Linux kernels do not offer /proc/self/oom_score_adj,
     but may have a previous version of the same functionality called
     /proc/self/oom_adj. This works the same except the disable
-    value is -17 not -1000. The corresponding
-    build flag for PostgreSQL is
-    -DLINUX_OOM_ADJ=0.
+    value is -17 not -1000.
diff --git a/src/backend/postmaster/fork_process.c b/src/backend/postmaster/fork_process.c
index f6df2de8706..5e5bd35e7e3 100644
--- a/src/backend/postmaster/fork_process.c
+++ b/src/backend/postmaster/fork_process.c
@@ -31,6 +31,7 @@ pid_t
 fork_process(void)
 {
     pid_t result;
+    const char *oomfilename;
 
 #ifdef LINUX_PROFILE
     struct itimerval prof_itimer;
@@ -71,62 +72,40 @@ fork_process(void)
          * process sizes *including shared memory*. (This is unbelievably
          * stupid, but the kernel hackers seem uninterested in improving it.)
          * Therefore it's often a good idea to protect the postmaster by
-         * setting its oom_score_adj value negative (which has to be done in a
-         * root-owned startup script). If you just do that much, all child
-         * processes will also be protected against OOM kill, which might not
-         * be desirable. You can then choose to build with
-         * LINUX_OOM_SCORE_ADJ #defined to 0, or to some other value that you
-         * want child processes to adopt here.
+         * setting its OOM score adjustment negative (which has to be done in
+         * a root-owned startup script). Since the adjustment is inherited by
+         * child processes, this would ordinarily mean that all the
+         * postmaster's children are equally protected against OOM kill, which
+         * is not such a good idea. So we provide this code to allow the
+         * children to change their OOM score adjustments again. Both the
+         * file name to write to and the value to write are controlled by
+         * environment variables, which can be set by the same startup script
+         * that did the original adjustment.
          */
-#ifdef LINUX_OOM_SCORE_ADJ
+        oomfilename = getenv("PG_OOM_ADJUST_FILE");
+
+        if (oomfilename != NULL)
         {
             /*
              * Use open() not stdio, to ensure we control the open flags. Some
              * Linux security environments reject anything but O_WRONLY.
              */
-            int fd = open("/proc/self/oom_score_adj", O_WRONLY, 0);
+            int fd = open(oomfilename, O_WRONLY, 0);
 
             /* We ignore all errors */
             if (fd >= 0)
             {
-                char buf[16];
+                const char *oomvalue = getenv("PG_OOM_ADJUST_VALUE");
                 int rc;
 
-                snprintf(buf, sizeof(buf), "%d\n", LINUX_OOM_SCORE_ADJ);
-                rc = write(fd, buf, strlen(buf));
+                if (oomvalue == NULL) /* supply a useful default */
+                    oomvalue = "0";
+
+                rc = write(fd, oomvalue, strlen(oomvalue));
                 (void) rc;
                 close(fd);
             }
         }
-#endif /* LINUX_OOM_SCORE_ADJ */
-
-        /*
-         * Older Linux kernels have oom_adj not oom_score_adj. This works
-         * similarly except with a different scale of adjustment values. If
-         * it's necessary to build Postgres to work with either API, you can
-         * define both LINUX_OOM_SCORE_ADJ and LINUX_OOM_ADJ.
-         */
-#ifdef LINUX_OOM_ADJ
-        {
-            /*
-             * Use open() not stdio, to ensure we control the open flags. Some
-             * Linux security environments reject anything but O_WRONLY.
-             */
-            int fd = open("/proc/self/oom_adj", O_WRONLY, 0);
-
-            /* We ignore all errors */
-            if (fd >= 0)
-            {
-                char buf[16];
-                int rc;
-
-                snprintf(buf, sizeof(buf), "%d\n", LINUX_OOM_ADJ);
-                rc = write(fd, buf, strlen(buf));
-                (void) rc;
-                close(fd);
-            }
-        }
-#endif /* LINUX_OOM_ADJ */
 
         /*
          * Make sure processes do not share OpenSSL randomness state.