diff --git a/src/main/java/com/kamco/cd/training/common/dto/MonitorDto.java b/src/main/java/com/kamco/cd/training/common/dto/MonitorDto.java new file mode 100644 index 0000000..ccd49ae --- /dev/null +++ b/src/main/java/com/kamco/cd/training/common/dto/MonitorDto.java @@ -0,0 +1,21 @@ +package com.kamco.cd.training.common.dto; + +import java.util.ArrayList; +import java.util.List; + +public class MonitorDto { + + public int cpu; // 30초 평균 (%) + public String memory; // "3.2/16GB" + public List gpus = new ArrayList<>(); + + public static class Gpu { + public int index; + public int util; + + public Gpu(int index, int util) { + this.index = index; + this.util = util; + } + } +} diff --git a/src/main/java/com/kamco/cd/training/common/service/GpuDmonReader.java b/src/main/java/com/kamco/cd/training/common/service/GpuDmonReader.java new file mode 100644 index 0000000..c16f837 --- /dev/null +++ b/src/main/java/com/kamco/cd/training/common/service/GpuDmonReader.java @@ -0,0 +1,158 @@ +package com.kamco.cd.training.common.service; + +import jakarta.annotation.PostConstruct; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import lombok.extern.log4j.Log4j2; +import org.springframework.stereotype.Component; + +@Component +@Log4j2 +public class GpuDmonReader { + + // ========================= + // GPU 사용률 저장소 + // - key: GPU index (0,1,2...) + // - value: 현재 GPU 사용률 (%) + // - ConcurrentHashMap → 멀티스레드 안전 + // ========================= + private final Map gpuUtilMap = new ConcurrentHashMap<>(); + + // ========================= + // nvidia-smi dmon 프로세스 + // - 스트리밍으로 GPU 상태를 계속 받아옴 + // ========================= + private volatile Process process; + + // ========================= + // 외부에서 GPU 사용률 조회 + // ========================= + public Map getGpuUtilMap() { + return gpuUtilMap; + } + + // ========================= + // Bean 초기화 시 실행 + // - 별도 스레드에서 dmon 실행 + // - 메인 스레드 block 방지 + // ========================= + @PostConstruct + public void start() { + + if (!isNvidiaAvailable()) { + log.warn("nvidia-smi not found. GPU monitoring disabled."); + return; + } + + Thread t = new Thread(this::runWithRestart, "gpu-dmon-thread"); + t.setDaemon(true); // 서버 종료 시 같이 종료 + t.start(); + } + + // ========================= + // dmon 실행 + 자동 재시작 루프 + // - dmon이 죽어도 계속 재시작 + // ========================= + private void runWithRestart() { + boolean firstError = true; + + while (true) { + try { + log.info("Starting nvidia-smi dmon..."); + runDmon(); + firstError = true; // 정상 실행되면 초기화 + + } catch (Exception e) { + + if (firstError) { + log.error("nvidia-smi not available. GPU monitoring disabled.", e); + firstError = false; + } else { + log.warn("dmon retry..."); + } + } + + try { + Thread.sleep(5000); // 5초 후에 시작 + } catch (InterruptedException ignored) {} + } + } + + // ========================= + // nvidia-smi dmon 실행 및 출력 파싱 + // ========================= + private void runDmon() throws Exception { + + // GPU utilization만 출력 (-s u) + ProcessBuilder pb = new ProcessBuilder( + "nvidia-smi", "dmon", "-s", "u" + ); + + process = pb.start(); + + // dmon은 stdout으로 계속 데이터를 뿌림 (스트리밍) + try (BufferedReader br = new BufferedReader( + new InputStreamReader(process.getInputStream())) + ) { + String line; + + while ((line = br.readLine()) != null) { + + // 헤더 라인 제거 (# 으로 시작) + if (line.startsWith("#")) continue; + + // 공백 기준 분리 + String[] parts = line.trim().split("\\s+"); + + // 최소 index, util 있어야 함 + if (parts.length < 3) continue; + + // GPU index (0,1,2...) + int index = Integer.parseInt(parts[0]); + + // GPU 사용률 (%) + int util = Integer.parseInt(parts[1]); + + // 최신 값으로 덮어쓰기 + gpuUtilMap.put(index, util); + } + } + + // 여기 도달했다는 건 dmon 프로세스가 종료된 상태 + int exitCode = process.waitFor(); + log.warn("dmon exited. code={}", exitCode); + + // 상위 루프에서 재시작하도록 예외 발생 + throw new IllegalStateException("dmon stopped"); + } + + // ========================= + // dmon 프로세스 살아있는지 확인 + // ========================= + public boolean isAlive() { + return process != null && process.isAlive(); + } + + // ========================= + // dmon 강제 재시작 + // - 기존 프로세스 종료 → runWithRestart에서 자동 재시작 + // ========================= + public void restart() { + try { + if (process != null && process.isAlive()) { + process.destroy(); + } + } catch (Exception ignored) {} + } + + private boolean isNvidiaAvailable() { + try { + Process p = new ProcessBuilder("which", "nvidia-smi").start(); + return p.waitFor() == 0; + } catch (Exception e) { + return false; + } + } +} diff --git a/src/main/java/com/kamco/cd/training/common/service/SystemMonitorService.java b/src/main/java/com/kamco/cd/training/common/service/SystemMonitorService.java new file mode 100644 index 0000000..a114eb5 --- /dev/null +++ b/src/main/java/com/kamco/cd/training/common/service/SystemMonitorService.java @@ -0,0 +1,226 @@ +package com.kamco.cd.training.common.service; + +import com.kamco.cd.training.common.dto.MonitorDto; +import java.io.BufferedReader; +import java.io.FileReader; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.Map; +import lombok.RequiredArgsConstructor; +import lombok.extern.log4j.Log4j2; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Service; + +@Service +@RequiredArgsConstructor +@Log4j2 +public class SystemMonitorService { + + // ========================= + // CPU 이전값 (누적값 → delta 계산용) + // ========================= + private long prevIdle = 0; + private long prevTotal = 0; + + // ========================= + // 최근 30초 히스토리 + // - CPU: 30개 (1초 * 30) + // - GPU: GPU별 30개 + // ========================= + private final Deque cpuHistory = new ArrayDeque<>(); + private final Map> gpuHistory = new java.util.HashMap<>(); + + // ========================= + // GPU dmon reader (스트리밍) + // ========================= + private final GpuDmonReader gpuReader; + + // ========================= + // 캐시 (API 응답용) + // - 매 요청마다 계산하지 않기 위해 사용 + // - volatile로 동시성 안전 보장 + // ========================= + private volatile MonitorDto cached = new MonitorDto(); + + // ========================= + // 1초마다 실행되는 수집 스케줄러 + // ========================= + @Scheduled(fixedRate = 1000) + public void collect() { + try { + + // ===================== + // 1. CPU 수집 + // ===================== + double cpu = readCpu(); + + cpuHistory.add(cpu); + + // 30개 유지 (rolling window) + if (cpuHistory.size() > 30) cpuHistory.poll(); + + // ===================== + // 2. GPU (dmon 데이터 사용) + // ===================== + Map gpuMap = gpuReader.getGpuUtilMap(); + + for (Map.Entry entry : gpuMap.entrySet()) { + int index = entry.getKey(); + int util = entry.getValue(); + + gpuHistory + .computeIfAbsent(index, k -> new ArrayDeque<>()) + .add(util); + + Deque q = gpuHistory.get(index); + if (q.size() > 30) q.poll(); + } + + // ===================== + // 3. 캐시 업데이트 + // ===================== + updateCache(gpuMap); + + } catch (Exception e) { + log.error("collect error", e); + } + } + + // ========================= + // CPU 사용률 계산 (/proc/stat) + // ========================= + private double readCpu() throws Exception { + + // linux 환경일때만 실행 + if (!isLinux()) { + return 0; // 또는 -1 + } + + BufferedReader br = new BufferedReader(new FileReader("/proc/stat")); + String line = br.readLine(); // "cpu ..." 라인 + br.close(); + + String[] p = line.split("\\s+"); + + long user = Long.parseLong(p[1]); + long nice = Long.parseLong(p[2]); + long system = Long.parseLong(p[3]); + long idle = Long.parseLong(p[4]); + long iowait = Long.parseLong(p[5]); + long irq = Long.parseLong(p[6]); + long softirq = Long.parseLong(p[7]); + + // 전체 시간 (누적) + long total = user + nice + system + idle + iowait + irq + softirq; + + // idle 시간 + long idleAll = idle + iowait; + + // 최초 호출 (이전값 없음) + if (prevTotal == 0) { + prevTotal = total; + prevIdle = idleAll; + return 0; + } + + // 이전 대비 변화량 + long totalDiff = total - prevTotal; + long idleDiff = idleAll - prevIdle; + + // 상태 업데이트 + prevTotal = total; + prevIdle = idleAll; + + // CPU 사용률 계산 + return (1.0 - (double) idleDiff / totalDiff) * 100; + } + + private boolean isLinux() { + return System.getProperty("os.name").toLowerCase().contains("linux"); + } + + // ========================= + // Memory (/proc/meminfo) + // - 현재값 (평균 아님) + // ========================= + private String readMemory() throws Exception { + + // linux 환경일때만 실행 + if (!isLinux()) { + return "N/A"; + } + + BufferedReader br = new BufferedReader(new FileReader("/proc/meminfo")); + + long total = 0; + long available = 0; + + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("MemTotal")) { + total = Long.parseLong(line.replaceAll("\\D+", "")); + } else if (line.startsWith("MemAvailable")) { + available = Long.parseLong(line.replaceAll("\\D+", "")); + } + } + + br.close(); + + long used = total - available; + + // kB → GB + double usedGB = used / (1024.0 * 1024); + double totalGB = total / (1024.0 * 1024); + + return String.format("%.1f/%.0fGB", usedGB, totalGB); + } + + // ========================= + // 캐시 업데이트 + // ========================= + private void updateCache(Map gpuMap) throws Exception { + + MonitorDto dto = new MonitorDto(); + + // ===================== + // CPU 평균 (30초) + // ===================== + dto.cpu = (int) cpuHistory.stream() + .mapToDouble(Double::doubleValue) + .average() + .orElse(0); + + // ===================== + // Memory (현재값) + // ===================== + dto.memory = readMemory(); + + // ===================== + // GPU 평균 (30초) + // ===================== + for (Map.Entry entry : gpuMap.entrySet()) { + + int index = entry.getKey(); + + Deque q = gpuHistory.get(index); + + int avg = (int) (q == null ? 0 : + q.stream().mapToInt(i -> i).average().orElse(0) + ); + + dto.gpus.add(new MonitorDto.Gpu(index, avg)); + } + + // ===================== + // 캐시 교체 (atomic) + // ===================== + this.cached = dto; + } + + // ========================= + // 외부 조회 (Controller에서 호출) + // ========================= + public MonitorDto get() { + return cached; + } +} diff --git a/src/main/java/com/kamco/cd/training/model/ModelTrainMngApiController.java b/src/main/java/com/kamco/cd/training/model/ModelTrainMngApiController.java index c6f0043..c3b8133 100644 --- a/src/main/java/com/kamco/cd/training/model/ModelTrainMngApiController.java +++ b/src/main/java/com/kamco/cd/training/model/ModelTrainMngApiController.java @@ -1,5 +1,7 @@ package com.kamco.cd.training.model; +import com.kamco.cd.training.common.dto.MonitorDto; +import com.kamco.cd.training.common.service.SystemMonitorService; import com.kamco.cd.training.config.api.ApiResponseDto; import com.kamco.cd.training.dataset.dto.DatasetDto; import com.kamco.cd.training.dataset.dto.DatasetDto.DatasetReq; @@ -40,6 +42,7 @@ public class ModelTrainMngApiController { private final ModelTrainMngService modelTrainMngService; private final ModelTrainMetricsJobService modelTrainMetricsJobService; private final ModelTestMetricsJobService modelTestMetricsJobService; + private final SystemMonitorService systemMonitorService; @Operation(summary = "모델학습 목록 조회", description = "모델학습 목록 조회 API") @ApiResponses( @@ -214,4 +217,22 @@ public class ModelTrainMngApiController { modelTestMetricsJobService.findTestValidMetricCsvFiles(); return ApiResponseDto.ok(null); } + + @Operation(summary = "학습서버 시스템 사용율 조회", description = "cpu, gpu, memory 사용율 조회") + @ApiResponses( + value = { + @ApiResponse( + responseCode = "200", + description = "검색 성공", + content = + @Content( + mediaType = "application/json", + schema = @Schema(implementation = Long.class))), + @ApiResponse(responseCode = "400", description = "잘못된 검색 조건", content = @Content), + @ApiResponse(responseCode = "500", description = "서버 오류", content = @Content) + }) + @GetMapping("/monitor") + public ApiResponseDto getSystem() throws IOException { + return ApiResponseDto.ok(systemMonitorService.get()); + } }