安装环境:Ubuntu 20.04
1、安装 munge
apt install munge
启动 munge 服务
systemctl enable munge
systemctl start munge
2、安装 slurm、slurmdbd 和 MySQL
apt install mysql-server slurm-wlm slurmdbd -y
3、配置数据库
进入 MySQL 命令行(以 root 身份执行 mysql;若提示无法连接,请先执行 systemctl start mysql.service 启动数据库服务):
mysql
create user 'amax'@'localhost' identified by 'amax@1234';
(amax 为用户名,amax@1234 为密码)
create database slurm_acct_db;
grant all PRIVILEGES on slurm_acct_db.* TO 'amax'@'localhost' with grant option; quit;
systemctl start mysql.service
systemctl enable mysql.service
4、配置 slurmdbd
vim /etc/slurm-llnl/slurmdbd.conf
ArchiveEvents=yes
ArchiveJobs=yes
ArchiveResvs=yes
ArchiveSuspend=no
ArchiveTXN=no
ArchiveUsage=no
#ArchiveDir="/tmp"
ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
DbdAddr=localhost
DbdHost=localhost
DbdPort=6819
SlurmUser=amax
#MessageTimeout=300
DebugLevel=4
#DefaultQOS=normal,standby
LogFile=/var/log/slurm-llnl/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost=localhost
StoragePort=3306
StoragePass=amax@1234
StorageUser=amax
StorageLoc=slurm_acct_db
PurgeEventAfter=12month
PurgeJobAfter=12month
PurgeResvAfter=2month
PurgeStepAfter=2month
PurgeSuspendAfter=1month
PurgeTXNAfter=12month
PurgeUsageAfter=12month
MaxQueryTimeRange=60-0
启动 slurmdbd 服务:
systemctl start slurmdbd
systemctl enable slurmdbd
5、配置 slurm
mkdir /var/spool/slurmd
mkdir /var/spool/slurmctld
chmod -R 777 /var/spool/slurmd/
chmod -R 777 /var/spool/slurmctld/
vim /etc/slurm-llnl/slurm.conf
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=cluster
SlurmctldHost=localhost
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=root
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
#TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
TaskPlugin=task/cgroup
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_CPU
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
AccountingStoragePass=/var/run/munge/munge.socket.2
#AccountingStoragePort=
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
#AccountingStoreFlags=
JobCompHost=localhost
JobCompLoc=slurm_acct_db
JobCompPass=amax@1234
#JobCompPort=
JobCompType=jobcomp/mysql
JobCompUser=amax
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=amax CPUs=2 RealMemory=1941
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
(节点配置可用 lscpu 或 slurmd -C 查看;本行说明不要写入 slurm.conf)
6、配置 gres(若要让 Slurm 调度 GPU,还需在 slurm.conf 中设置 GresTypes=gpu,并在 NodeName 行追加 Gres=gpu:4)
vim /etc/slurm-llnl/gres.conf
# This section of this file was automatically generated by cmd. Do not edit manually!
# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE
# No gres config specified
# END AUTOGENERATED SECTION -- DO NOT REMOVE
NodeName=amax Name=gpu File=/dev/nvidia[0-3]
7、配置 cgroup
vim /etc/slurm-llnl/cgroup.conf
# This section of this file was automatically generated by cmd. Do not edit manually!
# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE
CgroupMountpoint="/sys/fs/cgroup"
CgroupAutomount=no
TaskAffinity=no
ConstrainCores=yes
ConstrainRAMSpace=no
ConstrainSwapSpace=no
ConstrainDevices=yes
ConstrainKmemSpace=yes
AllowedRamSpace=100.00
AllowedSwapSpace=0.00
MinKmemSpace=30
MaxKmemPercent=100.00
MaxRAMPercent=100.00
MaxSwapPercent=100.00
MinRAMSpace=30
# END AUTOGENERATED SECTION -- DO NOT REMOVE
8、启动服务并设置开机自启:
systemctl start slurmctld
systemctl enable slurmctld
systemctl start slurmd
systemctl enable slurmd
9、查看服务状态:
systemctl status slurmctld
systemctl status slurmd
systemctl status slurmdbd
10、测试命令: