# Source code for scitex_resource._specs._metrics

"""Flat, machine-readable resource metrics for monitoring/heartbeat use.

Companion to ``get_specs()`` (which is rich + human-formatted). ``get_metrics()``
returns a flat dict of integers, floats and short strings — the shape any hub,
dashboard, or heartbeat producer can ship over the wire without reshaping.

Cross-platform via ``psutil`` (Linux / macOS / Windows / WSL). Container
caveat: on Linux, ``psutil.virtual_memory()`` reads ``/proc/meminfo``, which
inside Docker / cgroups normally shows the host kernel's view rather than the
cgroup limit (unless something like LXCFS overrides it), so the ``mem_*``
numbers may overstate what the process can actually use.

Schema (treat as a public contract; bump minor on rename):

    cpu_count          int     logical CPU count (psutil.cpu_count())
    cpu_model          str     human-readable CPU model name; "" if unknown
    load_avg_1m/5m/15m float   POSIX load averages; psutil emulates on Windows
    mem_total_mb       int     RAM total in MiB
    mem_used_mb        int     total minus available (excludes reclaimable cache)
    mem_free_mb        int     psutil's available — what apps can grab now
    mem_used_percent   float   psutil.virtual_memory().percent
    disk_total_mb      int     home-directory partition total in MiB
    disk_used_mb       int     home-directory partition used in MiB
    disk_used_percent  float   home-directory partition percent
    gpus               list    [{"name", "vram_total_mb", "vram_used_mb"}, ...]
                               empty list when no NVIDIA GPU / nvidia-smi missing

The ``gpu=False`` flag skips the ~200 ms ``nvidia-smi`` shellout for hot paths
that don't need GPU info (e.g. 30 s heartbeats on GPU-less hosts).
"""

from __future__ import annotations

import logging
import os
import shutil
import subprocess
from typing import Any

import psutil as _psutil

log = logging.getLogger(__name__)



# [docs]
def get_metrics(gpu: bool = True) -> dict[str, Any]:
    """Collect a flat snapshot of CPU, load, memory, disk and GPU metrics.

    Parameters
    ----------
    gpu : bool
        ``True`` (default) queries NVIDIA GPUs through ``nvidia-smi``.
        ``False`` skips that subprocess call and reports ``"gpus": []``,
        which is useful on hot paths or when GPU info is cached elsewhere.

    Returns
    -------
    dict
        Keys, in insertion order: ``cpu_count``, ``cpu_model``,
        ``load_avg_1m``, ``load_avg_5m``, ``load_avg_15m``, ``mem_total_mb``,
        ``mem_used_mb``, ``mem_free_mb``, ``mem_used_percent``,
        ``disk_total_mb``, ``disk_used_mb``, ``disk_used_percent``, ``gpus``.
        See module docstring for the full key list and contract.
    """
    bytes_per_mib = 1024 * 1024

    def to_mib(n_bytes) -> int:
        # Floor to whole MiB so the payload carries plain integers.
        return int(n_bytes // bytes_per_mib)

    # psutil may return None when the count is undetermined; report 0 then.
    logical_cpus = _psutil.cpu_count(logical=True)
    snapshot: dict[str, Any] = {
        "cpu_count": logical_cpus or 0,
        "cpu_model": _cpu_model(),
    }

    (
        snapshot["load_avg_1m"],
        snapshot["load_avg_5m"],
        snapshot["load_avg_15m"],
    ) = _load_avg()

    ram = _psutil.virtual_memory()
    snapshot.update(
        mem_total_mb=to_mib(ram.total),
        # "used" is derived as total minus available, not psutil's own field.
        mem_used_mb=to_mib(ram.total - ram.available),
        mem_free_mb=to_mib(ram.available),
        mem_used_percent=round(float(ram.percent), 1),
    )

    # Disk figures describe the partition holding the user's home directory;
    # an unreadable path degrades to zeros instead of failing the snapshot.
    try:
        home_disk = _psutil.disk_usage(os.path.expanduser("~"))
    except OSError:
        disk_fields: dict[str, Any] = {
            "disk_total_mb": 0,
            "disk_used_mb": 0,
            "disk_used_percent": 0.0,
        }
    else:
        disk_fields = {
            "disk_total_mb": to_mib(home_disk.total),
            "disk_used_mb": to_mib(home_disk.used),
            "disk_used_percent": round(float(home_disk.percent), 1),
        }
    snapshot.update(disk_fields)

    if gpu:
        snapshot["gpus"] = _nvidia_gpus()
    else:
        snapshot["gpus"] = []

    return snapshot



def _cpu_model() -> str:
    """Return a human-readable CPU model name, or ``""`` when none is found.

    Lookup order: the first ``model name`` line of ``/proc/cpuinfo`` on
    Linux, ``sysctl -n machdep.cpu.brand_string`` on macOS, and finally
    ``platform.processor()`` on every platform (also the fallback when the
    OS-specific probe fails or finds nothing).
    """
    import platform

    os_name = platform.system()

    if os_name == "Linux":
        try:
            with open("/proc/cpuinfo") as cpuinfo:
                # Only the first matching line is needed; stop reading there.
                hit = next(
                    (row for row in cpuinfo if row.startswith("model name")),
                    None,
                )
                if hit is not None:
                    return hit.split(":", 1)[1].strip()
        except OSError:
            pass
    elif os_name == "Darwin":
        sysctl_cmd = ["sysctl", "-n", "machdep.cpu.brand_string"]
        try:
            brand = subprocess.check_output(sysctl_cmd, text=True, timeout=1)
        except (OSError, subprocess.SubprocessError):
            pass
        else:
            return brand.strip()

    generic = platform.processor()
    return generic if generic else ""


def _load_avg() -> tuple[float, float, float]:
    """Return the 1-, 5- and 15-minute load averages, rounded to 2 decimals.

    Values come from ``psutil.getloadavg()`` (which psutil emulates on
    Windows from CPU samples). If that call raises ``OSError`` or
    ``AttributeError`` the result is ``(0.0, 0.0, 0.0)``.
    """
    unavailable = (0.0, 0.0, 0.0)
    try:
        one_min, five_min, fifteen_min = _psutil.getloadavg()
        return (
            round(float(one_min), 2),
            round(float(five_min), 2),
            round(float(fifteen_min), 2),
        )
    except (OSError, AttributeError):
        # NOTE(review): AttributeError presumably covers psutil builds
        # without getloadavg — confirm against supported psutil versions.
        return unavailable


def _nvidia_gpus() -> list[dict[str, Any]]:
    """Probe NVIDIA GPUs through ``nvidia-smi``.

    Best-effort: if ``nvidia-smi`` is not on ``PATH``, cannot be executed,
    exits non-zero, or exceeds the 3 s timeout, an empty list is returned
    instead of raising.

    Returns
    -------
    list of dict
        One ``{"name", "vram_total_mb", "vram_used_mb"}`` entry per output
        row that parses cleanly, in the order ``nvidia-smi`` printed them.
        Rows with fewer than three fields, or whose memory fields are not
        integers, are skipped.
    """
    if not shutil.which("nvidia-smi"):
        return []
    try:
        out = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-gpu=name,memory.total,memory.used",
                "--format=csv,noheader,nounits",
            ],
            # Keep driver warnings out of the caller's stderr; this function
            # may run on every heartbeat and failures already map to [].
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=3,
        )
    except (OSError, subprocess.SubprocessError):
        return []

    gpus: list[dict[str, Any]] = []
    for line in out.splitlines():
        # Split from the right: the last two columns are the numeric memory
        # fields, so any comma inside the GPU name stays part of the name.
        fields = [field.strip() for field in line.rsplit(",", 2)]
        if len(fields) != 3:
            continue
        name, total, used = fields
        try:
            gpus.append(
                {
                    "name": name,
                    "vram_total_mb": int(total),
                    "vram_used_mb": int(used),
                }
            )
        except ValueError:
            # Non-numeric memory value: skip this row, keep the others.
            continue
    return gpus