cgroup setup

EL7

(thanks to Brian for the help)

BASE_CGROUP = /system.slice/condor.service

/sys/fs/cgroup/{cpu,cpuacct/memory/...}/system.slice/condor.service/condor_var_lib_condor_execute_slot1_*\@WN.FQDN.HERE/

SL6

> cat /etc/cgconfig.conf
# Mount each cgroup controller under its own directory below /cgroup.
mount {
    cpuset    = /cgroup/cpuset;
    cpu    = /cgroup/cpu;
    cpuacct    = /cgroup/cpuacct;
    memory    = /cgroup/memory;
    devices    = /cgroup/devices;
    freezer    = /cgroup/freezer;
    net_cls    = /cgroup/net_cls;
    blkio    = /cgroup/blkio;
}
# Pre-create the "htcondor" group in the controllers listed below.
# Every controller entry must be a closed (here: empty) block.
group htcondor {
      cpu {}
      cpuacct {}
      memory {}
      freezer {}
      blkio {}
}

/cgroup/{cpu/cpuacct/memory}/htcondor/condor_var_lib_condor_execute_slot1_*\@WN.FQDN.HERE/

00worker.conf

# Base settings: this node runs only the master and the startd.
DAEMON_LIST = MASTER, STARTD
DEFAULT_DOMAIN_NAME = desy.de
UID_DOMAIN = desy.de
FILESYSTEM_DOMAIN = $(UID_DOMAIN)
# Read and write access is granted to every host inside the UID domain.
ALLOW_WRITE = *.$(UID_DOMAIN)
ALLOW_READ = *.$(UID_DOMAIN)
# NOTE(review): the address below looks like a placeholder - replace before deploying.
CONDOR_ADMIN = iMAIL@HERE.FOO
CONDOR_HOST = condor01.desy.de
COLLECTOR_NAME = Test Condor Pool - $(CONDOR_HOST)
# Local on/off switch for accepting jobs; listed in STARTD_ATTRS so that it is
# published in the machine ClassAd and can be referenced by START.
StartJobs = true
STARTD_ATTRS = StartJobs, $(STARTD_ATTRS)
# When is this node willing to run jobs?
# Both attributes must be exactly True: with =?= an undefined attribute
# compares as False, so a missing NODE_IS_HEALTHY keeps new jobs away.
# NODE_IS_HEALTHY is presumably published by the NODEHEALTH startd cron job
# defined in this file - confirm against the health-check script.
START = (NODE_IS_HEALTHY =?= True) && (StartJobs =?= True)

# Permanent way of stopping jobs from starting
# Only $(CONDOR_HOST) is given configuration access, runtime and persistent
# configuration changes are enabled, and StartJobs is the single attribute
# declared settable for the startd.
# NOTE(review): HOSTALLOW_CONFIG appears to be the older spelling of
# ALLOW_CONFIG, presumably kept for older HTCondor versions - confirm.
HOSTALLOW_CONFIG = $(CONDOR_HOST)
ALLOW_CONFIG = $(CONDOR_HOST)
ENABLE_RUNTIME_CONFIG = True
RUNTIME_CONFIG_ADMIN = $(CONDOR_HOST)
STARTD.SETTABLE_ATTRS_ADMINISTRATOR = StartJobs
ENABLE_PERSISTENT_CONFIG = True
# Directory holding the persistent overrides.
# NOTE(review): presumably must exist and be writable by HTCondor - verify.
PERSISTENT_CONFIG_DIR = /etc/condor/persistent

# use one shared port
# The shared-port daemon on this node is started with "-p 9620".
USE_SHARED_PORT = True
SHARED_PORT_ARGS = -p 9620
# The collector is addressed on the central manager at port 9618.
COLLECTOR_HOST = $(CONDOR_HOST):9618

# Enable CGROUP control
# HTCondor only treats '#' as a comment at the start of a line; a trailing
# "# ..." would become part of the value. The per-OS alternatives are therefore
# listed as comment lines - keep exactly one BASE_CGROUP assignment active:
#   SL6: BASE_CGROUP = htcondor
#   EL7: BASE_CGROUP = /system.slice/condor.service
BASE_CGROUP = /system.slice/condor.service
# hard: job can't access more physical memory than allocated
# soft: job can access more physical memory than allocated when there is free memory
CGROUP_MEMORY_LIMIT_POLICY = soft

# slots
# One slot of type 1 that owns 100% of the machine's resources and is
# partitionable, i.e. it is carved into smaller slots as jobs request them.
# NOTE(review): NUM_SLOTS is presumably redundant once a slot type is
# defined - confirm against the HTCondor manual.
NUM_SLOTS = 1
NUM_SLOTS_TYPE_1 = 1
SLOT_TYPE_1 = 100%
SLOT_TYPE_1_PARTITIONABLE = true
# Hyper-threaded cores are counted as CPUs.
COUNT_HYPERTHREAD_CPUS = true

# startd hook to check if node is healthy
# Registers a startd cron job named NODEHEALTH that runs the script below
# periodically, every 180 seconds.
# NOTE(review): the script is presumably what publishes NODE_IS_HEALTHY,
# the attribute tested in START - confirm against the script's output.
STARTD_CRON_JOBLIST = NODEHEALTH
STARTD_CRON_NODEHEALTH_EXECUTABLE = /etc/condor/tests/healthcheck_wn_condor.sh
STARTD_CRON_NODEHEALTH_PERIOD = 180s
STARTD_CRON_NODEHEALTH_MODE = Periodic

 

01grid.conf

# Marks this worker node as a grid resource (advertised via STARTD_ATTRS below).
GRID_RESOURCE = true

# start worker node allowing single core and multi core jobs
# to push mcore jobs, resources are partly drained and allocated to mcore-only
# https://www.gridpp.ac.uk/wiki/Example_Build_of_an_ARC/Condor_Cluster#Fallow
# legacy setting; the note sits on its own line because HTCondor does not
# support trailing comments (the text would become part of the value).
OnlyMulticore = False

# worker node attributes
STARTD_ATTRS = $(STARTD_ATTRS), GRID_RESOURCE, OnlyMulticore

02naf.conf