mpi_cluster_tools 0.1.5

A collection of cluster management commands for HPC environments
[
    {
        "AccountingGroup": "everybody",
        "ActivationSetupDuration": 1,
        "Args": "/home/barmstrong/.local/bin/uv run --no-sync -m src.main +experiment=linoss/prob_gw dataset.root_path=data/gw train.batch_size=256 train.val_check_interval=100",
        "AssignedGPUs": "GPU-b6357382",
        "AutoClusterAttrs": "AccountingGroup,GPUs_Capability,GPUs_DeviceName,GPUs_DriverVersion,GPUs_ECCEnabled,GPUs_GlobalMemoryMb,JobPrio,MachineLastMatchTime,Offline,Owner,PartitionableSlot,RequestCpus,RequestDisk,RequestGPUs,RequestMemory,Special,ConcurrencyLimits,FlockTo,Rank,Requirements,TotalJobRuntime,DiskUsage,FileSystemDomain",
        "AutoClusterId": 833,
        "BytesRecvd": 0.0,
        "BytesSent": 0.0,
        "ClusterId": 16337546,
        "Cmd": "/home/barmstrong/cluster/htcondor/cuda_wrapper.sh",
        "CommittedSlotTime": 0,
        "CommittedSuspensionTime": 0,
        "CommittedTime": 0,
        "CondorPlatform": "$CondorPlatform: X86_64-Ubuntu_22.04 $",
        "CondorVersion": "$CondorVersion: 10.0.9 2023-09-28 BuildID: 678199 PackageID: 10.0.9-1.1 $",
        "CoreSize": 0,
        "CpusProvisioned": 16,
        "CumulativeRemoteSysCpu": 0.0,
        "CumulativeRemoteUserCpu": 0.0,
        "CumulativeSlotTime": 0,
        "CumulativeSuspensionTime": 0,
        "CurrentHosts": 1,
        "DiskProvisioned": 2892100,
        "DiskUsage": 1,
        "DiskUsage_RAW": 1,
        "EncryptExecuteDirectory": false,
        "EnteredCurrentStatus": 1754665417,
        "Environment": "",
        "Err": "outputs/16337546.err",
        "ExecutableSize": 1,
        "ExecutableSize_RAW": 1,
        "ExitBySignal": false,
        "ExitStatus": 0,
        "FileSystemDomain": "cluster.is.localnet",
        "GlobalJobId": "sched.cluster.is.localnet#16337546.0#1754665416",
        "GPUsProvisioned": 1,
        "ImageSize": 1,
        "ImageSize_RAW": 1,
        "In": "/dev/null",
        "Iwd": "/lustre/home/barmstrong/ligoss",
        "JobCurrentStartDate": 1754665417,
        "JobCurrentStartExecutingDate": 1754665418,
        "JobLeaseDuration": 2400,
        "JobNotification": 0,
        "JobPrio": -750,
        "JobRunCount": 1,
        "JobStartDate": 1754665417,
        "JobStatus": 2,
        "JobSubmitMethod": 0,
        "JobUniverse": 5,
        "LastJobLeaseRenewal": 1754665609,
        "LastJobStatus": 1,
        "LastMatchTime": 1754665417,
        "LastSuspensionTime": 0,
        "LeaveJobInQueue": false,
        "MachineAttrMinRunningPrice0": 25,
        "MaxHosts": 1,
        "MemoryProvisioned": 64000,
        "MinHosts": 1,
        "MyType": "Job",
        "NumCkpts": 0,
        "NumCkpts_RAW": 0,
        "NumJobCompletions": 0,
        "NumJobMatches": 1,
        "NumJobStarts": 0,
        "NumRestarts": 0,
        "NumShadowStarts": 1,
        "NumSystemHolds": 0,
        "OnExitHold": false,
        "OnExitRemove": true,
        "OrigMaxHosts": 1,
        "Out": "outputs/16337546.out",
        "Owner": "barmstrong",
        "PeriodicHold": false,
        "PeriodicRelease": false,
        "PeriodicRemove": "\/Expr((JobStatus is 2) && ((CurrentTime - JobCurrentStartDate) >= 16000))\/",
        "ProcId": 0,
        "PublicClaimId": "<172.22.2.136:9618?addrs=172.22.2.136-9618&alias=g136.internal.cluster.is.localnet&noUDP&sock=startd_5342_bf03>#1742565954#19256#...",
        "QDate": 1754665416,
        "Rank": 0.0,
        "RemoteHost": "slot1_5@g136.internal.cluster.is.localnet",
        "RemoteSlotID": 1,
        "RemoteSysCpu": 0.0,
        "RemoteUserCpu": 0.0,
        "RemoteWallClockTime": 0.0,
        "RequestCpus": 16,
        "RequestDisk": "\/Expr(DiskUsage)\/",
        "RequestGPUs": 1,
        "RequestMemory": 64000,
        "Requirements": "\/Expr((TARGET.CUDAGlobalMemoryMb > 60000) && (TARGET.Arch == \"X86_64\") && (TARGET.OpSys == \"LINUX\") && (TARGET.Disk >= RequestDisk) && (TARGET.Memory >= RequestMemory) && (TARGET.Cpus >= RequestCpus) && (TARGET.GPUs >= RequestGPUs) && ((TARGET.FileSystemDomain == MY.FileSystemDomain) || (TARGET.HasFileTransfer)))\/",
        "RootDir": "/",
        "ServerTime": 1754665661,
        "ShadowBday": 1754665417,
        "ShouldTransferFiles": "IF_NEEDED",
        "StartdIpAddr": "<172.22.2.136:9618?addrs=172.22.2.136-9618&alias=g136.internal.cluster.is.localnet&noUDP&sock=startd_5342_bf03>",
        "StartdPrincipal": "execute-side@matchsession/172.22.2.136",
        "StreamErr": false,
        "StreamOut": false,
        "TargetType": "Machine",
        "TotalSubmitProcs": 1,
        "TotalSuspensions": 0,
        "TransferExecutable": false,
        "TransferIn": false,
        "TransferInputSizeMB": 0,
        "TransferInputStats": {},
        "TransferOutputStats": {},
        "User": "barmstrong@cluster.is.localnet",
        "UserLog": "/lustre/home/barmstrong/ligoss/outputs/16337546.log",
        "WhenToTransferOutput": "ON_EXIT"
    }
]