diff --git a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java
index 9230022..8d6a7fa 100644
--- a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java
+++ b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMngCoreService.java
@@ -430,6 +430,57 @@ public class ModelTrainMngCoreService {
     master.setUpdatedDttm(ZonedDateTime.now());
   }
 
+  /**
+   * Marks the model as stopped during step 1.
+   *
+   * <p>Sets the overall status and the step 1 state to {@code STOPPED} and stores the reason as
+   * the last error. No explicit save is issued here, so the method is transactional, like
+   * {@code markSuccess}; presumably the changes are written when that transaction commits.
+   *
+   * @param modelId id of the model to update
+   * @param errorMessage reason for the stop, stored as the model's last error
+   * @throws IllegalArgumentException if no model exists for {@code modelId}
+   */
+  @Transactional
+  public void markStep1Stop(Long modelId, String errorMessage) {
+    ModelMasterEntity master =
+        modelMngRepository
+            .findById(modelId)
+            .orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
+
+    master.setStatusCd(TrainStatusType.STOPPED.getId());
+    // A step 1 stop belongs in the step 1 state; step 2 has its own method.
+    master.setStep1State(TrainStatusType.STOPPED.getId());
+    master.setLastError(errorMessage);
+    master.setUpdatedUid(userUtil.getId());
+    master.setUpdatedDttm(ZonedDateTime.now());
+  }
+
+  /**
+   * Marks the model as stopped during step 2.
+   *
+   * <p>Sets the overall status and the step 2 state to {@code STOPPED} and stores the reason as
+   * the last error. No explicit save is issued here, so the method is transactional, like
+   * {@code markSuccess}; presumably the changes are written when that transaction commits.
+   *
+   * @param modelId id of the model to update
+   * @param errorMessage reason for the stop, stored as the model's last error
+   * @throws IllegalArgumentException if no model exists for {@code modelId}
+   */
+  @Transactional
+  public void markStep2Stop(Long modelId, String errorMessage) {
+    ModelMasterEntity master =
+        modelMngRepository
+            .findById(modelId)
+            .orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
+
+    master.setStatusCd(TrainStatusType.STOPPED.getId());
+    master.setStep2State(TrainStatusType.STOPPED.getId());
+    master.setLastError(errorMessage);
+    master.setUpdatedUid(userUtil.getId());
+    master.setUpdatedDttm(ZonedDateTime.now());
+  }
+
   @Transactional
   public void markSuccess(Long modelId) {
     ModelMasterEntity master =
diff --git a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java
index 113aa5a..cef50ab 100644
--- a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java
+++ 
b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java @@ -93,6 +93,7 @@ public class JobRecoveryOnStartupService { if (out.completed()) { log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId()); modelTrainJobCoreService.markSuccess(job.getId(), 0); + // model 상태 변경 markStepSuccessByJobType(job); } else { @@ -104,7 +105,7 @@ public class JobRecoveryOnStartupService { modelTrainJobCoreService.markFailed( job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE"); - + // model 상태 변경 markStepErrorByJobType(job, out.reason()); } continue; @@ -136,13 +137,17 @@ public class JobRecoveryOnStartupService { "[RECOVERY] container stopped (will be auto removed by --rm). container={}", containerName); - // 여기서 상태를 PAUSED로 바꿔도 되고 + // job 상태 변경 modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); + // model 상태 변경 + markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); } catch (Exception e) { log.error("[RECOVERY] docker stop failed. container={}", containerName, e); - + // job 상태 변경 modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); + // model 상태 변경 + markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); } continue; } @@ -156,6 +161,7 @@ public class JobRecoveryOnStartupService { if (exitCode != null && exitCode == 0) { log.info("[RECOVERY] container exited(0). mark SUCCESS. 
container={}", containerName); modelTrainJobCoreService.markSuccess(job.getId(), 0); + // model 상태 변경 markStepSuccessByJobType(job); } else { @@ -168,7 +174,7 @@ public class JobRecoveryOnStartupService { modelTrainJobCoreService.markFailed( job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO"); - + // model 상태 변경 markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status); } @@ -180,7 +186,7 @@ public class JobRecoveryOnStartupService { modelTrainJobCoreService.markFailed( job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR"); - + // model 상태 변경 markStepErrorByJobType(job, "inspect-error"); } } @@ -206,6 +212,16 @@ public class JobRecoveryOnStartupService { * *
예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러
*/
+ private void markStepStopByJobType(ModelTrainJobDto job, String msg) {
+ Map