SDK
The Healthcheck SDK is available in Python and lets you write your own rules.
classDiagram
class HealthcheckRule {
<<abstract>>
cluster_load ClusterLoadDataProvider
congestion CongestionDataProvider
cores CoresDataProvider
cores_memory CoresMemoryDataProvider
execution_time ExecTimeDataProvider
gpu GpuDataProvider
gpu_load GpuLoadDataProvider
job_frequency JobFrequencyDataProvider
job_load JobLoadDataProvider
jobs JobsDataProvider
kpi KpiDataProvider
memory MemoryDataProvider
memory_ratio MemoryRatioDataProvider
nodes NodesDataProvider
nodes_load NodesLoadDataProvider
occupancy OccupancyDataProvider
slowdown SlowdownDataProvider
state StateDataProvider
submission_time SubmitDateDataProvider
waiting_time WaitTimeDataProvider
evaluate()
workload()
}
class Alert {
level AlertLevel
title str
description str
reason str
metadata dict
}
class AlertLevel {
<<enumeration>>
OK
LOW
MEDIUM
HIGH
CRITICAL
}
HealthcheckRule --> Alert : evaluate() returns
Alert --> AlertLevel : uses
HealthcheckRule --> ClusterLoadDataProvider : cluster_load
HealthcheckRule --> CongestionDataProvider : congestion
HealthcheckRule --> CoresDataProvider : cores
HealthcheckRule --> CoresMemoryDataProvider : cores_memory
HealthcheckRule --> ExecTimeDataProvider : execution_time
HealthcheckRule --> GpuDataProvider : gpu
HealthcheckRule --> GpuLoadDataProvider : gpu_load
HealthcheckRule --> JobFrequencyDataProvider : job_frequency
HealthcheckRule --> JobLoadDataProvider : job_load
HealthcheckRule --> JobsDataProvider : jobs
HealthcheckRule --> KpiDataProvider : kpi
HealthcheckRule --> MemoryDataProvider : memory
HealthcheckRule --> MemoryRatioDataProvider : memory_ratio
HealthcheckRule --> NodesDataProvider : nodes
HealthcheckRule --> NodesLoadDataProvider : nodes_load
HealthcheckRule --> OccupancyDataProvider : occupancy
HealthcheckRule --> SlowdownDataProvider : slowdown
HealthcheckRule --> StateDataProvider : state
HealthcheckRule --> SubmitDateDataProvider : submission_time
HealthcheckRule --> WaitTimeDataProvider : waiting_time
ClusterLoadDataProvider --> Co2LoadDTO : get_co2_load()
ClusterLoadDataProvider --> GroupedCo2LoadDTO : get_co2_load_by_group()
ClusterLoadDataProvider --> CoreLoadDTO : get_core_load()
ClusterLoadDataProvider --> GroupedCoreLoadDTO : get_core_load_by_group()
ClusterLoadDataProvider --> CorehourLoadDTO : get_corehour_load()
ClusterLoadDataProvider --> CostLoadDTO : get_cost_load()
ClusterLoadDataProvider --> GroupedCostLoadDTO : get_cost_load_by_group()
ClusterLoadDataProvider --> PowerLoadDTO : get_power_load()
ClusterLoadDataProvider --> GroupedPowerLoadDTO : get_power_load_by_group()
CongestionDataProvider --> CongestionMetricsDTO : get_congestion_metrics()
CoresDataProvider --> CoresFlatDistributionDTO : get_flat_distribution()
CoresDataProvider --> CoresGroupedDistributionDTO : get_grouped_distribution()
CoresMemoryDataProvider --> CoresMemoryFlatMatrixDTO : get_flat_matrix()
CoresMemoryDataProvider --> CoresMemoryGroupedMatrixDTO : get_grouped_matrix()
ExecTimeDataProvider --> ExecTimeDistributionDTO : get_exec_time()
ExecTimeDataProvider --> ExecTimeVsTimelimitDTO : get_exec_time_vs_timelimit()
GpuDataProvider --> GpuFlatDistributionDTO : get_flat_distribution()
GpuDataProvider --> GpuGroupedDistributionDTO : get_grouped_distribution()
GpuLoadDataProvider --> GpuLoadDTO : get_gpu_load()
GpuLoadDataProvider --> GroupedGpuLoadDTO : get_gpu_load_by_group()
GpuLoadDataProvider --> GpuhourLoadDTO : get_gpuhour_load()
JobFrequencyDataProvider --> InterArrivalDTO : get_interarrival()
JobFrequencyDataProvider --> dict : get_interarrival_grouped()
JobFrequencyDataProvider --> JobFrequencyDTO : get_job_frequency()
JobFrequencyDataProvider --> list : get_job_frequency_grouped()
JobLoadDataProvider --> JobLoadDTO : get_job_load()
JobLoadDataProvider --> GroupedJobLoadDTO : get_job_load_by_group()
JobsDataProvider --> JobFieldsDTO : get_available_fields()
JobsDataProvider --> JobCountDTO : get_job_count()
JobsDataProvider --> JobsPageDTO : get_jobs_page()
KpiDataProvider --> CarbonFootprintDTO : get_carbon_footprint()
KpiDataProvider --> HardwareStatsDTO : get_core_stats()
KpiDataProvider --> CostMetricsDTO : get_cost_metrics()
KpiDataProvider --> EnergyMetricsDTO : get_energy_metrics()
KpiDataProvider --> HardwareStatsDTO : get_gpu_stats()
KpiDataProvider --> int : get_job_count()
KpiDataProvider --> LogsMetricsDTO : get_logs_metrics()
KpiDataProvider --> int : get_user_count()
MemoryDataProvider --> MemoryFlatDistributionDTO : get_memory_flat_distribution()
MemoryDataProvider --> MemoryGroupedDistributionDTO : get_memory_grouped_distribution()
MemoryRatioDataProvider --> MemoryRatioFlatDistributionDTO : get_flat_distribution()
MemoryRatioDataProvider --> MemoryRatioGroupedDistributionDTO : get_grouped_distribution()
NodesDataProvider --> NodesFlatDistributionDTO : get_flat_distribution()
NodesDataProvider --> NodesGroupedDistributionDTO : get_grouped_distribution()
NodesLoadDataProvider --> NodesJobCountDTO : get_nb_jobs_per_node()
OccupancyDataProvider --> CoresOccupancyDTO : get_cores_occupancy()
OccupancyDataProvider --> GroupedCoresOccupancyDTO : get_cores_occupancy_by_group()
OccupancyDataProvider --> NodesOccupancyDTO : get_nodes_occupancy()
OccupancyDataProvider --> GroupedNodesOccupancyDTO : get_nodes_occupancy_by_group()
SlowdownDataProvider --> SlowdownDistributionDTO : get_slowdown_distribution()
SlowdownDataProvider --> SlowdownStatsDTO : get_slowdown_stats()
StateDataProvider --> JobsStatusDTO : get_jobs_status()
StateDataProvider --> GroupedJobsStatusDTO : get_jobs_status_grouped()
StateDataProvider --> JobsStatusYmsDTO : get_jobs_status_yms()
StateDataProvider --> GroupedJobsStatusYmsDTO : get_jobs_status_yms_grouped()
SubmitDateDataProvider --> SubmitHourDTO : get_submit_hour()
SubmitDateDataProvider --> SubmitWeekdayDTO : get_submit_weekday()
WaitTimeDataProvider --> WaitTimeDistributionDTO : get_wait_time_distribution()
WaitTimeDataProvider --> WaitTimeStatsDTO : get_wait_time_stats()