From eb7680b95249204e15d03ab89b2aeaeb9909fbbc Mon Sep 17 00:00:00 2001 From: teddy Date: Wed, 4 Mar 2026 00:43:20 +0900 Subject: [PATCH] =?UTF-8?q?=EB=A6=AC=EC=BB=A4=EB=B2=84=EB=A6=AC=20?= =?UTF-8?q?=EC=88=98=EC=A0=95=20=ED=85=8C=EC=8A=A4=ED=8A=B8=20=EB=A1=9C?= =?UTF-8?q?=EA=B7=B8=20=EC=B6=94=EA=B0=80,?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../service/JobRecoveryOnStartupService.java | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java index f2510e5..61644d1 100644 --- a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java +++ b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java @@ -349,33 +349,66 @@ public class JobRecoveryOnStartupService { */ private OutputResult probeOutputs(ModelTrainJobDto job) { try { + + log.info( + "[RECOVERY] probeOutputs start. jobId={}, modelId={}", job.getId(), job.getModelId()); + + // 1) 출력 디렉토리 확인 Path outDir = resolveOutputDir(job); if (outDir == null || !Files.isDirectory(outDir)) { + log.warn("[RECOVERY] output directory missing. jobId={}, path={}", job.getId(), outDir); return new OutputResult(false, "output-dir-missing"); } + log.info("[RECOVERY] output directory found. jobId={}, path={}", job.getId(), outDir); + + // 2) totalEpoch 확인 Integer totalEpoch = extractTotalEpoch(job).orElse(null); if (totalEpoch == null || totalEpoch <= 0) { + log.warn( + "[RECOVERY] totalEpoch missing or invalid. jobId={}, totalEpoch={}", + job.getId(), + totalEpoch); return new OutputResult(false, "total-epoch-missing"); } + log.info("[RECOVERY] totalEpoch={}. jobId={}", totalEpoch, job.getId()); + + // 3) val.csv 존재 확인 Path valCsv = outDir.resolve("val.csv"); if (!Files.exists(valCsv)) { + log.warn("[RECOVERY] val.csv missing. jobId={}, path={}", job.getId(), valCsv); return new OutputResult(false, "val.csv-missing"); } + // 4) val.csv 라인 수 확인 long lines = countNonHeaderLines(valCsv); - // “같아야 완료” 정책 + log.info( + "[RECOVERY] val.csv lines counted. jobId={}, lines={}, expected={}", + job.getId(), + lines, + totalEpoch); + + // 5) 완료 판정 if (lines == totalEpoch) { + log.info("[RECOVERY] outputs look COMPLETE. jobId={}", job.getId()); return new OutputResult(true, "ok"); } + log.warn( + "[RECOVERY] val.csv line mismatch. jobId={}, lines={}, expected={}", + job.getId(), + lines, + totalEpoch); + return new OutputResult( false, "val.csv-lines-mismatch lines=" + lines + " expected=" + totalEpoch); } catch (Exception e) { + log.error("[RECOVERY] probeOutputs error. jobId={}", job.getId(), e); + return new OutputResult(false, "probe-error"); } }