SDK

The Healthcheck SDK is available in Python and lets you write your own rules.

        classDiagram
    class HealthcheckRule {
        <<abstract>>
        cluster_load ClusterLoadDataProvider
        congestion CongestionDataProvider
        cores CoresDataProvider
        cores_memory CoresMemoryDataProvider
        execution_time ExecTimeDataProvider
        gpu GpuDataProvider
        gpu_load GpuLoadDataProvider
        job_frequency JobFrequencyDataProvider
        job_load JobLoadDataProvider
        jobs JobsDataProvider
        kpi KpiDataProvider
        memory MemoryDataProvider
        memory_ratio MemoryRatioDataProvider
        nodes NodesDataProvider
        nodes_load NodesLoadDataProvider
        occupancy OccupancyDataProvider
        slowdown SlowdownDataProvider
        state StateDataProvider
        submission_time SubmitDateDataProvider
        waiting_time WaitTimeDataProvider
        evaluate()
        workload()
    }
    class Alert {
        level AlertLevel
        title str
        description str
        reason str
        metadata dict
    }
    class AlertLevel {
        <<enumeration>>
        OK
        LOW
        MEDIUM
        HIGH
        CRITICAL
    }
    HealthcheckRule --> Alert : evaluate() returns
    Alert --> AlertLevel : uses
    HealthcheckRule --> ClusterLoadDataProvider : cluster_load
    HealthcheckRule --> CongestionDataProvider : congestion
    HealthcheckRule --> CoresDataProvider : cores
    HealthcheckRule --> CoresMemoryDataProvider : cores_memory
    HealthcheckRule --> ExecTimeDataProvider : execution_time
    HealthcheckRule --> GpuDataProvider : gpu
    HealthcheckRule --> GpuLoadDataProvider : gpu_load
    HealthcheckRule --> JobFrequencyDataProvider : job_frequency
    HealthcheckRule --> JobLoadDataProvider : job_load
    HealthcheckRule --> JobsDataProvider : jobs
    HealthcheckRule --> KpiDataProvider : kpi
    HealthcheckRule --> MemoryDataProvider : memory
    HealthcheckRule --> MemoryRatioDataProvider : memory_ratio
    HealthcheckRule --> NodesDataProvider : nodes
    HealthcheckRule --> NodesLoadDataProvider : nodes_load
    HealthcheckRule --> OccupancyDataProvider : occupancy
    HealthcheckRule --> SlowdownDataProvider : slowdown
    HealthcheckRule --> StateDataProvider : state
    HealthcheckRule --> SubmitDateDataProvider : submission_time
    HealthcheckRule --> WaitTimeDataProvider : waiting_time
    ClusterLoadDataProvider --> Co2LoadDTO : get_co2_load()
    ClusterLoadDataProvider --> GroupedCo2LoadDTO : get_co2_load_by_group()
    ClusterLoadDataProvider --> CoreLoadDTO : get_core_load()
    ClusterLoadDataProvider --> GroupedCoreLoadDTO : get_core_load_by_group()
    ClusterLoadDataProvider --> CorehourLoadDTO : get_corehour_load()
    ClusterLoadDataProvider --> CostLoadDTO : get_cost_load()
    ClusterLoadDataProvider --> GroupedCostLoadDTO : get_cost_load_by_group()
    ClusterLoadDataProvider --> PowerLoadDTO : get_power_load()
    ClusterLoadDataProvider --> GroupedPowerLoadDTO : get_power_load_by_group()
    CongestionDataProvider --> CongestionMetricsDTO : get_congestion_metrics()
    CoresDataProvider --> CoresFlatDistributionDTO : get_flat_distribution()
    CoresDataProvider --> CoresGroupedDistributionDTO : get_grouped_distribution()
    CoresMemoryDataProvider --> CoresMemoryFlatMatrixDTO : get_flat_matrix()
    CoresMemoryDataProvider --> CoresMemoryGroupedMatrixDTO : get_grouped_matrix()
    ExecTimeDataProvider --> ExecTimeDistributionDTO : get_exec_time()
    ExecTimeDataProvider --> ExecTimeVsTimelimitDTO : get_exec_time_vs_timelimit()
    GpuDataProvider --> GpuFlatDistributionDTO : get_flat_distribution()
    GpuDataProvider --> GpuGroupedDistributionDTO : get_grouped_distribution()
    GpuLoadDataProvider --> GpuLoadDTO : get_gpu_load()
    GpuLoadDataProvider --> GroupedGpuLoadDTO : get_gpu_load_by_group()
    GpuLoadDataProvider --> GpuhourLoadDTO : get_gpuhour_load()
    JobFrequencyDataProvider --> InterArrivalDTO : get_interarrival()
    JobFrequencyDataProvider --> dict : get_interarrival_grouped()
    JobFrequencyDataProvider --> JobFrequencyDTO : get_job_frequency()
    JobFrequencyDataProvider --> list : get_job_frequency_grouped()
    JobLoadDataProvider --> JobLoadDTO : get_job_load()
    JobLoadDataProvider --> GroupedJobLoadDTO : get_job_load_by_group()
    JobsDataProvider --> JobFieldsDTO : get_available_fields()
    JobsDataProvider --> JobCountDTO : get_job_count()
    JobsDataProvider --> JobsPageDTO : get_jobs_page()
    KpiDataProvider --> CarbonFootprintDTO : get_carbon_footprint()
    KpiDataProvider --> HardwareStatsDTO : get_core_stats()
    KpiDataProvider --> CostMetricsDTO : get_cost_metrics()
    KpiDataProvider --> EnergyMetricsDTO : get_energy_metrics()
    KpiDataProvider --> HardwareStatsDTO : get_gpu_stats()
    KpiDataProvider --> int : get_job_count()
    KpiDataProvider --> LogsMetricsDTO : get_logs_metrics()
    KpiDataProvider --> int : get_user_count()
    MemoryDataProvider --> MemoryFlatDistributionDTO : get_memory_flat_distribution()
    MemoryDataProvider --> MemoryGroupedDistributionDTO : get_memory_grouped_distribution()
    MemoryRatioDataProvider --> MemoryRatioFlatDistributionDTO : get_flat_distribution()
    MemoryRatioDataProvider --> MemoryRatioGroupedDistributionDTO : get_grouped_distribution()
    NodesDataProvider --> NodesFlatDistributionDTO : get_flat_distribution()
    NodesDataProvider --> NodesGroupedDistributionDTO : get_grouped_distribution()
    NodesLoadDataProvider --> NodesJobCountDTO : get_nb_jobs_per_node()
    OccupancyDataProvider --> CoresOccupancyDTO : get_cores_occupancy()
    OccupancyDataProvider --> GroupedCoresOccupancyDTO : get_cores_occupancy_by_group()
    OccupancyDataProvider --> NodesOccupancyDTO : get_nodes_occupancy()
    OccupancyDataProvider --> GroupedNodesOccupancyDTO : get_nodes_occupancy_by_group()
    SlowdownDataProvider --> SlowdownDistributionDTO : get_slowdown_distribution()
    SlowdownDataProvider --> SlowdownStatsDTO : get_slowdown_stats()
    StateDataProvider --> JobsStatusDTO : get_jobs_status()
    StateDataProvider --> GroupedJobsStatusDTO : get_jobs_status_grouped()
    StateDataProvider --> JobsStatusYmsDTO : get_jobs_status_yms()
    StateDataProvider --> GroupedJobsStatusYmsDTO : get_jobs_status_yms_grouped()
    SubmitDateDataProvider --> SubmitHourDTO : get_submit_hour()
    SubmitDateDataProvider --> SubmitWeekdayDTO : get_submit_weekday()
    WaitTimeDataProvider --> WaitTimeDistributionDTO : get_wait_time_distribution()
    WaitTimeDataProvider --> WaitTimeStatsDTO : get_wait_time_stats()