diff --git a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java index 37084b0..f1f5e7e 100644 --- a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java +++ b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java @@ -100,16 +100,24 @@ public class JobRecoveryOnStartupService { modelTrainMetricsJobService.findTrainValidMetricCsvFiles(); } else { - // 3-3) 산출물이 부족하면 실패 처리(운영 정책에 따라 "유예"도 가능) + + // 3-3) 산출물이 부족하면 중단처리 + // 산출물이 부족하면 "실패 확정"이 아니라 "중단/보류"로 처리해서 + // 운영자가 재시작(재개)할 수 있게 한다. log.warn( - "[RECOVERY] outputs incomplete. mark FAILED. jobId={} reason={}", + "[RECOVERY] outputs incomplete. mark PAUSED/STOP for restart. jobId={} reason={}", job.getId(), out.reason()); - modelTrainJobCoreService.markFailed( - job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE"); - // model 상태 변경 - markStepErrorByJobType(job, out.reason()); + Integer modelId = job.getModelId() == null ? null : Math.toIntExact(job.getModelId()); + + // PAUSED/STOP + modelTrainJobCoreService.markPaused( + job.getId(), modelId, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE"); + + // 모델도 에러가 아니라 STOP으로 + markStepStopByJobType( + job, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE: " + out.reason()); } continue; }