- we put the general worker node setup in config /etc/condor/config.d/00worker.conf
- cgroup setup differs between Scientific Linux/CentOS 6 and 7
- settings for running grid jobs are organized in /etc/condor/config.d/01grid.conf
- since we have grid and NAF as general users on HTCondor, we keep settings separate if relevant only to one or the other
- for NAF: todo
- each node checks its health every 180s (e.g. free disk space, the state of the CVMFS mounts, ...)
- we use GridPP's health check script with minor modifications: healthcheck_wn_condor.sh
- the check is run via a condor cron hook in the worker's config
- a freshly rebooted node waits 10 minutes before it becomes healthy and accepts jobs
- a worker node will only offer its resources when healthy
00worker.conf
# Daemons started by condor_master on a worker node
DAEMON_LIST = MASTER, STARTD
# Site domain; the UID and file system domains are both set to desy.de
DEFAULT_DOMAIN_NAME = desy.de
UID_DOMAIN = desy.de
FILESYSTEM_DOMAIN = $(UID_DOMAIN)
# Grant read and write access to every host in $(UID_DOMAIN)
ALLOW_WRITE = *.$(UID_DOMAIN)
ALLOW_READ = *.$(UID_DOMAIN)
# NOTE(review): looks like a placeholder - replace with the real admin mail address
CONDOR_ADMIN = iMAIL@HERE.FOO
# Central manager host; also referenced by COLLECTOR_NAME below
CONDOR_HOST = condor01.desy.de
COLLECTOR_NAME = Test Condor Pool - $(CONDOR_HOST)
# Administrative switch for job starts; listed in STARTD_ATTRS so it is
# part of the machine ClassAd and can be used in the START expression
StartJobs = true
STARTD_ATTRS = StartJobs, $(STARTD_ATTRS)
# When is this node willing to run jobs?
# Only if the node is healthy and StartJobs is switched on.
# NOTE(review): NODE_IS_HEALTHY is presumably published by the NODEHEALTH
# startd cron job - confirm against the health check script
START = (NODE_IS_HEALTHY =?= True) && (StartJobs =?= True)
# Permanent way of stopping jobs from starting:
# $(CONDOR_HOST) may change StartJobs at runtime, and the change is kept
# in the persistent config directory.
# NOTE(review): HOSTALLOW_CONFIG looks like the older spelling of
# ALLOW_CONFIG - confirm whether it is still needed
HOSTALLOW_CONFIG = $(CONDOR_HOST)
ALLOW_CONFIG = $(CONDOR_HOST)
ENABLE_RUNTIME_CONFIG = True
RUNTIME_CONFIG_ADMIN = $(CONDOR_HOST)
# StartJobs is the only attribute the startd lets an administrator set remotely
STARTD.SETTABLE_ATTRS_ADMINISTRATOR = StartJobs
ENABLE_PERSISTENT_CONFIG = True
PERSISTENT_CONFIG_DIR = /etc/condor/persistent
# use one shared port; the shared port daemon is started with "-p 9620"
USE_SHARED_PORT = True
SHARED_PORT_ARGS = -p 9620
# the collector is addressed on $(CONDOR_HOST), port 9618
COLLECTOR_HOST = $(CONDOR_HOST):9618
# Enable CGROUP control
# HTCondor does not support trailing comments: everything after "=" becomes
# the value. The OS-specific alternatives are therefore listed as comment
# lines; keep exactly one active assignment, matching the node's OS:
#   SL6: BASE_CGROUP = htcondor
#   EL7: BASE_CGROUP = /system.slice/condor.service
BASE_CGROUP = htcondor
# hard: job can't access more physical memory than allocated
# soft: job can access more physical memory than allocated when there is free memory
CGROUP_MEMORY_LIMIT_POLICY = soft
# slots: one slot of type 1 that holds 100% of the node's resources
# and is partitionable
NUM_SLOTS = 1
NUM_SLOTS_TYPE_1 = 1
SLOT_TYPE_1 = 100%
SLOT_TYPE_1_PARTITIONABLE = true
# hyperthreads are counted as CPUs
COUNT_HYPERTHREAD_CPUS = true
# startd hook to check if node is healthy
# registers one startd cron job, NODEHEALTH, which runs the health check
# script periodically, every 180 seconds
STARTD_CRON_JOBLIST = NODEHEALTH
STARTD_CRON_NODEHEALTH_EXECUTABLE = /etc/condor/tests/healthcheck_wn_condor.sh
STARTD_CRON_NODEHEALTH_PERIOD = 180s
STARTD_CRON_NODEHEALTH_MODE = Periodic
01grid.conf
# flag advertised in the machine ClassAd (added to STARTD_ATTRS below)
GRID_RESOURCE = true
# start worker node allowing single core and multi core jobs
# to push mcore jobs, resources are partly drained and allocated to mcore-only
# https://www.gridpp.ac.uk/wiki/Example_Build_of_an_ARC/Condor_Cluster#Fallow
# legacy attribute; the note sits on its own line because HTCondor would
# take a trailing "# legacy" as part of the value
OnlyMulticore = False
# worker node attributes
STARTD_ATTRS = $(STARTD_ATTRS), GRID_RESOURCE, OnlyMulticore
02naf.conf