diff --git a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java index f2510e5..61644d1 100644 --- a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java +++ b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java @@ -349,33 +349,66 @@ public class JobRecoveryOnStartupService { */ private OutputResult probeOutputs(ModelTrainJobDto job) { try { + + log.info( + "[RECOVERY] probeOutputs start. jobId={}, modelId={}", job.getId(), job.getModelId()); + + // 1) 출력 디렉토리 확인 Path outDir = resolveOutputDir(job); if (outDir == null || !Files.isDirectory(outDir)) { + log.warn("[RECOVERY] output directory missing. jobId={}, path={}", job.getId(), outDir); return new OutputResult(false, "output-dir-missing"); } + log.info("[RECOVERY] output directory found. jobId={}, path={}", job.getId(), outDir); + + // 2) totalEpoch 확인 Integer totalEpoch = extractTotalEpoch(job).orElse(null); if (totalEpoch == null || totalEpoch <= 0) { + log.warn( + "[RECOVERY] totalEpoch missing or invalid. jobId={}, totalEpoch={}", + job.getId(), + totalEpoch); return new OutputResult(false, "total-epoch-missing"); } + log.info("[RECOVERY] totalEpoch={}. jobId={}", totalEpoch, job.getId()); + + // 3) val.csv 존재 확인 Path valCsv = outDir.resolve("val.csv"); if (!Files.exists(valCsv)) { + log.warn("[RECOVERY] val.csv missing. jobId={}, path={}", job.getId(), valCsv); return new OutputResult(false, "val.csv-missing"); } + // 4) val.csv 라인 수 확인 long lines = countNonHeaderLines(valCsv); - // “같아야 완료” 정책 + log.info( + "[RECOVERY] val.csv lines counted. jobId={}, lines={}, expected={}", + job.getId(), + lines, + totalEpoch); + + // 5) 완료 판정 if (lines == totalEpoch) { + log.info("[RECOVERY] outputs look COMPLETE. jobId={}", job.getId()); return new OutputResult(true, "ok"); } + log.warn( + "[RECOVERY] val.csv line mismatch. jobId={}, lines={}, expected={}", + job.getId(), + lines, + totalEpoch); + return new OutputResult( false, "val.csv-lines-mismatch lines=" + lines + " expected=" + totalEpoch); } catch (Exception e) { + log.error("[RECOVERY] probeOutputs error. jobId={}", job.getId(), e); + return new OutputResult(false, "probe-error"); } }