Skip to content

Commit

Permalink
add jobs collector
Browse files Browse the repository at this point in the history
  • Loading branch information
robert-oleynik committed Jan 22, 2025
1 parent 895c7c6 commit 4d03c86
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 0 deletions.
127 changes: 127 additions & 0 deletions jobs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/* Copyright 2025 Scalableminds
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */


package main

import (
"os/exec"
"encoding/json"
"log"
"fmt"
"io/ioutil"
"github.com/prometheus/client_golang/prometheus"
)

// JobsData runs `squeue --json` and returns everything the command wrote to
// its standard output.
//
// Every failure (creating the pipe, starting the command, reading its output,
// or a non-zero exit) terminates the process through log.Fatal, so the
// returned slice always holds the complete output of a successful run.
func JobsData() []byte {
	cmd := exec.Command("squeue", "--json")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// The pipe must be fully read before Wait is called. A read error is
	// reported instead of being dropped, so truncated output is never handed
	// to the JSON decoder.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}

// SqueueResult is the top-level document printed by `squeue --json`. Only the
// "jobs" array is decoded.
type SqueueResult struct {
	jobs []SqueueJob
}

// UnmarshalJSON decodes the "jobs" array into the unexported jobs field.
//
// encoding/json only populates exported struct fields, so without this method
// json.Unmarshal would succeed but leave the result empty. Decoding goes
// through an exported shadow struct and the value is then copied over.
func (r *SqueueResult) UnmarshalJSON(data []byte) error {
	var raw struct {
		Jobs []SqueueJob `json:"jobs"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}
	r.jobs = raw.Jobs
	return nil
}

// SqueueJob holds the subset of a single squeue job entry that is exported as
// metric labels. The JSON key of each field is noted next to it.
type SqueueJob struct {
	account       string               // "account"
	id            int                  // "job_id"
	name          string               // "name"
	resources     SqueueJobResources   // "job_resources"
	state         []string             // "job_state"
	nodes         string               // "nodes"
	partition     string               // "partition"
	groupId       int                  // "group_id"
	groupName     string               // "group_name"
	userId        int                  // "user_id"
	userName      string               // "user_name"
	memoryPerNode SqeueueMemoryPerNode // "memory_per_node"
}

// UnmarshalJSON decodes one job object. The fields of SqueueJob are
// unexported and therefore invisible to encoding/json, so the object is
// decoded into an exported shadow struct and copied field by field.
func (j *SqueueJob) UnmarshalJSON(data []byte) error {
	var raw struct {
		Account       string               `json:"account"`
		ID            int                  `json:"job_id"`
		Name          string               `json:"name"`
		Resources     SqueueJobResources   `json:"job_resources"`
		State         []string             `json:"job_state"`
		Nodes         string               `json:"nodes"`
		Partition     string               `json:"partition"`
		GroupID       int                  `json:"group_id"`
		GroupName     string               `json:"group_name"`
		UserID        int                  `json:"user_id"`
		UserName      string               `json:"user_name"`
		MemoryPerNode SqeueueMemoryPerNode `json:"memory_per_node"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}
	*j = SqueueJob{
		account:       raw.Account,
		id:            raw.ID,
		name:          raw.Name,
		resources:     raw.Resources,
		state:         raw.State,
		nodes:         raw.Nodes,
		partition:     raw.Partition,
		groupId:       raw.GroupID,
		groupName:     raw.GroupName,
		userId:        raw.UserID,
		userName:      raw.UserName,
		memoryPerNode: raw.MemoryPerNode,
	}
	return nil
}

// SqueueJobResources is the "job_resources" object of a job; only its "cpus"
// value is kept.
type SqueueJobResources struct {
	cpus int
}

// UnmarshalJSON decodes the "cpus" key into the unexported cpus field.
func (jr *SqueueJobResources) UnmarshalJSON(data []byte) error {
	var raw struct {
		CPUs int `json:"cpus"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}
	jr.cpus = raw.CPUs
	return nil
}

// SqeueueMemoryPerNode is the "memory_per_node" object of a job; only its
// "number" value is kept.
type SqeueueMemoryPerNode struct {
	number int
}

// UnmarshalJSON decodes the "number" key into the unexported number field.
func (m *SqeueueMemoryPerNode) UnmarshalJSON(data []byte) error {
	var raw struct {
		Number int `json:"number"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}
	m.number = raw.Number
	return nil
}

// InstrumentJobs fetches the raw squeue output via JobsData and decodes it
// into a SqueueResult. A JSON decoding error terminates the process through
// log.Fatal.
func InstrumentJobs() SqueueResult {
	var parsed SqueueResult
	raw := JobsData()
	err := json.Unmarshal(raw, &parsed)
	if err != nil {
		log.Fatal(err)
	}
	return parsed
}

// JobsCollector is the Prometheus collector for Slurm jobs; Describe and
// Collect are its methods.
type JobsCollector struct {
// jobs is the descriptor used for every metric this collector emits.
jobs *prometheus.Desc
}

// NewJobsCollector builds a JobsCollector whose single descriptor,
// "slurm_jobs", carries one variable label per exported job attribute. The
// label order here must match the order of the label values built in Collect.
func NewJobsCollector() *JobsCollector {
	desc := prometheus.NewDesc(
		"slurm_jobs",
		"Description of running Slurm jobs",
		[]string{
			"account",
			"job_id",
			"name",
			"cpus",
			"memory_per_node",
			"state",
			"nodes",
			"partition",
			"group_id",
			"group_name",
			"user_id",
			"user_name",
		},
		nil,
	)
	return &JobsCollector{jobs: desc}
}

// Describe sends the collector's jobs descriptor to ch.
func (jc *JobsCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- jc.jobs
}

// Collect queries squeue through InstrumentJobs and sends one gauge sample
// with value 1 per job to ch. The job's attributes are carried in the label
// values, listed in the same order as the label names declared in
// NewJobsCollector.
func (jc *JobsCollector) Collect(ch chan<- prometheus.Metric) {
	jm := InstrumentJobs()
	for _, job := range jm.jobs {
		labels := []string{
			job.account,
			// Integer fields are formatted with %d; %s on an int renders as
			// "%!s(int=N)" instead of the number.
			fmt.Sprintf("%d", job.id),
			job.name,
			fmt.Sprintf("%d", job.resources.cpus),
			fmt.Sprintf("%d", job.memoryPerNode.number),
			// state is a []string; %s prints it in bracketed form, e.g. "[RUNNING]".
			fmt.Sprintf("%s", job.state),
			job.nodes,
			job.partition,
			fmt.Sprintf("%d", job.groupId),
			job.groupName,
			fmt.Sprintf("%d", job.userId),
			job.userName,
		}
		ch <- prometheus.MustNewConstMetric(jc.jobs, prometheus.GaugeValue, 1.0, labels...)
	}
}
1 change: 1 addition & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func init() {
prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go
prometheus.MustRegister(NewFairShareCollector()) // from sshare.go
prometheus.MustRegister(NewUsersCollector()) // from users.go
prometheus.MustRegister(NewJobsCollector()) // from jobs.go
}

var listenAddress = flag.String(
Expand Down

0 comments on commit 4d03c86

Please sign in to comment.