리커버리 수정 #147

Merged
teddy merged 1 commits from feat/training_260303 into develop 2026-03-04 00:06:22 +09:00
2 changed files with 59 additions and 5 deletions

View File

@@ -430,6 +430,44 @@ public class ModelTrainMngCoreService {
master.setUpdatedDttm(ZonedDateTime.now());
}
/**
* step1 정지 처리
*
* @param modelId
* @param errorMessage
*/
public void markStep1Stop(Long modelId, String errorMessage) {
ModelMasterEntity master =
modelMngRepository
.findById(modelId)
.orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
master.setStatusCd(TrainStatusType.STOPPED.getId());
master.setStep2State(TrainStatusType.STOPPED.getId());
master.setLastError(errorMessage);
master.setUpdatedUid(userUtil.getId());
master.setUpdatedDttm(ZonedDateTime.now());
}
/**
* step2 정지 처리
*
* @param modelId
* @param errorMessage
*/
public void markStep2Stop(Long modelId, String errorMessage) {
ModelMasterEntity master =
modelMngRepository
.findById(modelId)
.orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
master.setStatusCd(TrainStatusType.STOPPED.getId());
master.setStep2State(TrainStatusType.STOPPED.getId());
master.setLastError(errorMessage);
master.setUpdatedUid(userUtil.getId());
master.setUpdatedDttm(ZonedDateTime.now());
}
@Transactional
public void markSuccess(Long modelId) {
ModelMasterEntity master =

View File

@@ -93,6 +93,7 @@ public class JobRecoveryOnStartupService {
if (out.completed()) {
log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId());
modelTrainJobCoreService.markSuccess(job.getId(), 0);
// model 상태 변경
markStepSuccessByJobType(job);
} else {
@@ -104,7 +105,7 @@ public class JobRecoveryOnStartupService {
modelTrainJobCoreService.markFailed(
job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE");
// model 상태 변경
markStepErrorByJobType(job, out.reason());
}
continue;
@@ -136,13 +137,17 @@ public class JobRecoveryOnStartupService {
"[RECOVERY] container stopped (will be auto removed by --rm). container={}",
containerName);
// 여기서 상태를 PAUSED로 바꿔도 되고
// job 상태 변경
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
} catch (Exception e) {
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
// job 상태 변경
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
}
continue;
}
@@ -156,6 +161,7 @@ public class JobRecoveryOnStartupService {
if (exitCode != null && exitCode == 0) {
log.info("[RECOVERY] container exited(0). mark SUCCESS. container={}", containerName);
modelTrainJobCoreService.markSuccess(job.getId(), 0);
// model 상태 변경
markStepSuccessByJobType(job);
} else {
@@ -168,7 +174,7 @@ public class JobRecoveryOnStartupService {
modelTrainJobCoreService.markFailed(
job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO");
// model 상태 변경
markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status);
}
@@ -180,7 +186,7 @@ public class JobRecoveryOnStartupService {
modelTrainJobCoreService.markFailed(
job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR");
// model 상태 변경
markStepErrorByJobType(job, "inspect-error");
}
}
@@ -206,6 +212,16 @@ public class JobRecoveryOnStartupService {
*
* <p>예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러
*/
private void markStepStopByJobType(ModelTrainJobDto job, String msg) {
Map<String, Object> params = job.getParamsJson();
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));
if (isEval) {
modelTrainMngCoreService.markStep2Stop(job.getModelId(), msg);
} else {
modelTrainMngCoreService.markStep1Stop(job.getModelId(), msg);
}
}
private void markStepErrorByJobType(ModelTrainJobDto job, String msg) {
Map<String, Object> params = job.getParamsJson();
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));