From 693de354d2ebc5958e03cbcdab2174a4092bab2b Mon Sep 17 00:00:00 2001 From: teddy Date: Wed, 4 Mar 2026 00:06:04 +0900 Subject: [PATCH] =?UTF-8?q?=EB=A6=AC=EC=BB=A4=EB=B2=84=EB=A6=AC=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/ModelTrainMngCoreService.java | 40 ++++++++++++++++++++ .../service/JobRecoveryOnStartupService.java | 26 ++++++++++--- 2 files changed, 61 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java index 9230022..8d6a7fa 100644 --- a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java +++ b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java @@ -430,6 +430,46 @@ public class ModelTrainMngCoreService { master.setUpdatedDttm(ZonedDateTime.now()); } + /** + * Marks step1 as STOPPED and records the reason; throws IllegalArgumentException if not found. + * + * @param modelId id of the model master row to update + * @param errorMessage reason stored as the model's last error + */ + @Transactional + public void markStep1Stop(Long modelId, String errorMessage) { + ModelMasterEntity master = + modelMngRepository + .findById(modelId) + .orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId)); + + master.setStatusCd(TrainStatusType.STOPPED.getId()); + master.setStep1State(TrainStatusType.STOPPED.getId()); + master.setLastError(errorMessage); + master.setUpdatedUid(userUtil.getId()); + master.setUpdatedDttm(ZonedDateTime.now()); + } + + /** + * Marks step2 as STOPPED and records the reason; throws IllegalArgumentException if not found. + * + * @param modelId id of the model master row to update + * @param errorMessage reason stored as the model's last error + */ + @Transactional + public void markStep2Stop(Long modelId, String errorMessage) { + ModelMasterEntity master = + modelMngRepository + .findById(modelId) + .orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId)); + + master.setStatusCd(TrainStatusType.STOPPED.getId()); + master.setStep2State(TrainStatusType.STOPPED.getId()); + master.setLastError(errorMessage); + master.setUpdatedUid(userUtil.getId());
+ master.setUpdatedDttm(ZonedDateTime.now()); + } + @Transactional public void markSuccess(Long modelId) { ModelMasterEntity master = diff --git a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java index 113aa5a..cef50ab 100644 --- a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java +++ b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java @@ -93,6 +93,7 @@ public class JobRecoveryOnStartupService { if (out.completed()) { log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId()); modelTrainJobCoreService.markSuccess(job.getId(), 0); + // model 상태 변경 markStepSuccessByJobType(job); } else { @@ -104,7 +105,7 @@ public class JobRecoveryOnStartupService { modelTrainJobCoreService.markFailed( job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE"); - + // model 상태 변경 markStepErrorByJobType(job, out.reason()); } continue; @@ -136,13 +137,17 @@ public class JobRecoveryOnStartupService { "[RECOVERY] container stopped (will be auto removed by --rm). container={}", containerName); - // 여기서 상태를 PAUSED로 바꿔도 되고 + // job 상태 변경 modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); + // model 상태 변경 + markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); } catch (Exception e) { log.error("[RECOVERY] docker stop failed. container={}", containerName, e); - + // job 상태 변경 modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); + // model 상태 변경 + markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); } continue; } @@ -156,6 +161,7 @@ public class JobRecoveryOnStartupService { if (exitCode != null && exitCode == 0) { log.info("[RECOVERY] container exited(0). mark SUCCESS. 
container={}", containerName); modelTrainJobCoreService.markSuccess(job.getId(), 0); + // model 상태 변경 markStepSuccessByJobType(job); } else { @@ -168,7 +174,7 @@ public class JobRecoveryOnStartupService { modelTrainJobCoreService.markFailed( job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO"); - + // model 상태 변경 markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status); } @@ -180,7 +186,7 @@ public class JobRecoveryOnStartupService { modelTrainJobCoreService.markFailed( job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR"); - + // model 상태 변경 markStepErrorByJobType(job, "inspect-error"); } } @@ -206,6 +212,16 @@ public class JobRecoveryOnStartupService { * *

예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러 */ + private void markStepStopByJobType(ModelTrainJobDto job, String msg) { + // EVAL jobs stop step2; any other job type (or missing params) stops step1. + Map jobParams = job.getParamsJson(); + if (jobParams != null && "EVAL".equals(String.valueOf(jobParams.get("jobType")))) { + modelTrainMngCoreService.markStep2Stop(job.getModelId(), msg); + return; + } + modelTrainMngCoreService.markStep1Stop(job.getModelId(), msg); + } + private void markStepErrorByJobType(ModelTrainJobDto job, String msg) { Map params = job.getParamsJson(); boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));