salt master losing children
I recently set up psdiff on a few of my servers as a basic means to monitor process activity.
It disclosed that my SaltStack master daemon — which I’m running as a non-privileged user — was losing a single child, exactly 24 hours after I had ran salt commands. This seemed to be a recurring phenomenon.
The salt server — version 0.17.5+ds-1 on Ubuntu Trusty — was running these processes:
salt 0:00 /usr/bin/salt-master
salt 0:22 \_ [salt-master] <defunct> <-- clear_old_jobs_proc
salt 0:00 \_ /usr/bin/salt-master <-- start_publisher
salt 0:01 \_ /usr/bin/salt-master <-- start_event_publisher
salt 0:04 \_ /usr/bin/salt-master <-- worker process 1..5
salt 0:03 \_ /usr/bin/salt-master as configured by the
salt 0:04 \_ /usr/bin/salt-master 'worker_threads' setting
salt 0:04 \_ /usr/bin/salt-master
salt 0:04 \_ /usr/bin/salt-master
So, why did the clear_old_jobs_proc die?
I added some debug code. Observe, the %(process)5s
in the log format
which adds the PID to log messages.
# tail /etc/salt/master.d/*
==> /etc/salt/master.d/salt-log.conf <==
log_level: info
log_fmt_logfile: '%(asctime)s,%(msecs)03.0f [%(name)-17s][%(levelname)-8s/%(process)5s] %(message)s'
==> /etc/salt/master.d/salt-security.conf <==
sign_pub_messages: True
==> /etc/salt/master.d/salt-user.conf <==
user: salt
And then a patch against master.py
:
--- /usr/lib/python2.7/dist-packages/salt/master.py.orig 2016-01-14 00:10:41.218945899 +0100
+++ /usr/lib/python2.7/dist-packages/salt/master.py 2016-01-14 00:17:28.520966737 +0100
@@ -178,6 +178,15 @@ class Master(SMaster):
SMaster.__init__(self, opts)
def _clear_old_jobs(self):
+ log.warning('entering clear_old_jobs')
+ try:
+ self.__clear_old_jobs()
+ except BaseException as e:
+ log.exception('clear_old_jobs exception: {}'.format(e))
+ raise
+ log.exception('clear_old_jobs no exception at all')
+
+ def __clear_old_jobs(self):
'''
The clean old jobs function is the general passive maintenance process
controller for the Salt master. This is where any data that needs to
Lo and behold, I got errors:
2016-01-15 00:17:54,253 [salt.master ][ERROR / 2836] clear_old_jobs exception:
[Errno 13] Permission denied: '/var/cache/salt/master/.dfnt'
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/salt/master.py", line 183, in _clear_old_jobs
self.__clear_old_jobs()
File "/usr/lib/python2.7/dist-packages/salt/master.py", line 227, in __clear_old_jobs
salt.crypt.dropfile(self.opts['cachedir'])
File "/usr/lib/python2.7/dist-packages/salt/crypt.py", line 73, in dropfile
with salt.utils.fopen(dfnt, 'w+') as fp_:
File "/usr/lib/python2.7/dist-packages/salt/utils/__init__.py", line 930, in fopen
fhandle = open(*args, **kwargs)
IOError: [Errno 13] Permission denied: '/var/cache/salt/master/.dfnt'
That’s odd, the salt-master
runs as user salt and both that file and
directory belong to that user. But, the file only has read
permissions. And as non-superuser, trying to write to my own files will
fail if it has no write permissions.
Why does it have no write permissions if we’re going to write to it?
So, grab a fresh copy of Salt from github and see if the bug is shallow.
$ python
>>> from salt.crypt import dropfile
[CRITICAL] Unable to import msgpack or msgpack_pure python modules
>>> dropfile('/tmp')
^Z
$ ls /tmp/.dfn -la
-r-------- 1 walter walter 0 jan 15 10:34 /tmp/.dfn
$ fg
>>> dropfile('/tmp')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "salt/crypt.py", line 66, in dropfile
with salt.utils.fopen(dfn, 'wb+') as fp_:
File "salt/utils/__init__.py", line 1209, in fopen
fhandle = open(*args, **kwargs)
IOError: [Errno 13] Permission denied: '/tmp/.dfn'
Yay for shallow bugs! Double yay for not needing complex setup just to test this single function. Go Salt!
This should fix things:
diff --git a/salt/crypt.py b/salt/crypt.py
index 1809c8e..30a73da 100644
--- a/salt/crypt.py
+++ b/salt/crypt.py
@@ -60,11 +60,11 @@ def dropfile(cachedir, user=None):
try:
log.info('Rotating AES key')
- if (salt.utils.is_windows() and os.path.isfile(dfn) and
- not os.access(dfn, os.W_OK)):
- os.chmod(dfn, stat.S_IWUSR)
+ if os.path.isfile(dfn) and not os.access(dfn, os.W_OK):
+ os.chmod(dfn, stat.S_IRUSR | stat.S_IWUSR)
with salt.utils.fopen(dfn, 'wb+') as fp_:
fp_.write('')
+ os.chmod(dfn, stat.S_IRUSR)
if user:
try:
import pwd
Or this patch against the packaged version:
--- /usr/lib/python2.7/dist-packages/salt/crypt.py.orig 2016-01-15 10:49:32.369935604 +0100
+++ /usr/lib/python2.7/dist-packages/salt/crypt.py 2016-01-18 14:10:35.608671989 +0100
@@ -10,9 +10,9 @@ import os
import sys
import time
import hmac
-import shutil
import hashlib
import logging
+import stat
# Import third party libs
try:
@@ -38,7 +38,6 @@ def dropfile(cachedir, user=None):
'''
Set an aes dropfile to update the publish session key
'''
- dfnt = os.path.join(cachedir, '.dfnt')
dfn = os.path.join(cachedir, '.dfn')
def ready():
@@ -70,18 +69,24 @@ def dropfile(cachedir, user=None):
aes = Crypticle.generate_key_string()
mask = os.umask(191)
- with salt.utils.fopen(dfnt, 'w+') as fp_:
- fp_.write(aes)
- if user:
- try:
- import pwd
- uid = pwd.getpwnam(user).pw_uid
- os.chown(dfnt, uid, -1)
- shutil.move(dfnt, dfn)
- except (KeyError, ImportError, OSError, IOError):
- pass
- os.umask(mask)
+ try:
+ log.info('Rotating AES key')
+
+ if os.path.isfile(dfn) and not os.access(dfn, os.W_OK):
+ os.chmod(dfn, stat.S_IRUSR | stat.S_IWUSR)
+ with salt.utils.fopen(dfn, 'wb+') as fp_:
+ fp_.write(aes)
+ os.chmod(dfn, stat.S_IRUSR)
+ if user:
+ try:
+ import pwd
+ uid = pwd.getpwnam(user).pw_uid
+ os.chown(dfn, uid, -1)
+ except (KeyError, ImportError, OSError, IOError):
+ pass
+ finally:
+ os.umask(mask) # restore original umask
def gen_keys(keydir, keyname, keysize, user=None):