Merge pull request '리커버리 수정' (#147) from feat/training_260303 into develop

Reviewed-on: #147
This commit was merged in pull request #147.
This commit is contained in:
2026-03-04 00:06:21 +09:00
2 changed files with 59 additions and 5 deletions

View File

@@ -430,6 +430,44 @@ public class ModelTrainMngCoreService {
master.setUpdatedDttm(ZonedDateTime.now()); master.setUpdatedDttm(ZonedDateTime.now());
} }
/**
 * Marks a model as stopped during step 1.
 *
 * <p>Sets the overall status code and the step-1 state to {@code STOPPED}, stores the given
 * message as the model's last error, and stamps the updater id and update time.
 *
 * @param modelId id of the model to update
 * @param errorMessage reason for the stop, stored as the model's last error
 * @throws IllegalArgumentException if no model exists for {@code modelId}
 */
// No explicit save() is issued here, so the entity has to stay managed for the setters below
// to be written back. NOTE(review): assumes JPA dirty checking on commit — confirm.
@Transactional
public void markStep1Stop(Long modelId, String errorMessage) {
  ModelMasterEntity master =
      modelMngRepository
          .findById(modelId)
          .orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));

  master.setStatusCd(TrainStatusType.STOPPED.getId());
  // Step 1 was the running step, so it is the step-1 state that must become STOPPED;
  // the step-2 state is intentionally left untouched.
  master.setStep1State(TrainStatusType.STOPPED.getId());
  master.setLastError(errorMessage);
  master.setUpdatedUid(userUtil.getId());
  master.setUpdatedDttm(ZonedDateTime.now());
}
/**
 * Marks a model as stopped during step 2.
 *
 * <p>Sets the overall status code and the step-2 state to {@code STOPPED}, stores the given
 * message as the model's last error, and stamps the updater id and update time.
 *
 * @param modelId id of the model to update
 * @param errorMessage reason for the stop, stored as the model's last error
 * @throws IllegalArgumentException if no model exists for {@code modelId}
 */
// No explicit save() is issued here, so the entity has to stay managed for the setters below
// to be written back. NOTE(review): assumes JPA dirty checking on commit — confirm.
@Transactional
public void markStep2Stop(Long modelId, String errorMessage) {
  ModelMasterEntity master =
      modelMngRepository
          .findById(modelId)
          .orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));

  master.setStatusCd(TrainStatusType.STOPPED.getId());
  master.setStep2State(TrainStatusType.STOPPED.getId());
  master.setLastError(errorMessage);
  master.setUpdatedUid(userUtil.getId());
  master.setUpdatedDttm(ZonedDateTime.now());
}
@Transactional @Transactional
public void markSuccess(Long modelId) { public void markSuccess(Long modelId) {
ModelMasterEntity master = ModelMasterEntity master =

View File

@@ -93,6 +93,7 @@ public class JobRecoveryOnStartupService {
if (out.completed()) { if (out.completed()) {
log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId()); log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId());
modelTrainJobCoreService.markSuccess(job.getId(), 0); modelTrainJobCoreService.markSuccess(job.getId(), 0);
// model 상태 변경
markStepSuccessByJobType(job); markStepSuccessByJobType(job);
} else { } else {
@@ -104,7 +105,7 @@ public class JobRecoveryOnStartupService {
modelTrainJobCoreService.markFailed( modelTrainJobCoreService.markFailed(
job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE"); job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE");
// model 상태 변경
markStepErrorByJobType(job, out.reason()); markStepErrorByJobType(job, out.reason());
} }
continue; continue;
@@ -136,13 +137,17 @@ public class JobRecoveryOnStartupService {
"[RECOVERY] container stopped (will be auto removed by --rm). container={}", "[RECOVERY] container stopped (will be auto removed by --rm). container={}",
containerName); containerName);
// 여기서 상태를 PAUSED로 바꿔도 되고 // job 상태 변경
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
} catch (Exception e) { } catch (Exception e) {
log.error("[RECOVERY] docker stop failed. container={}", containerName, e); log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
// job 상태 변경
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
} }
continue; continue;
} }
@@ -156,6 +161,7 @@ public class JobRecoveryOnStartupService {
if (exitCode != null && exitCode == 0) { if (exitCode != null && exitCode == 0) {
log.info("[RECOVERY] container exited(0). mark SUCCESS. container={}", containerName); log.info("[RECOVERY] container exited(0). mark SUCCESS. container={}", containerName);
modelTrainJobCoreService.markSuccess(job.getId(), 0); modelTrainJobCoreService.markSuccess(job.getId(), 0);
// model 상태 변경
markStepSuccessByJobType(job); markStepSuccessByJobType(job);
} else { } else {
@@ -168,7 +174,7 @@ public class JobRecoveryOnStartupService {
modelTrainJobCoreService.markFailed( modelTrainJobCoreService.markFailed(
job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO"); job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO");
// model 상태 변경
markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status); markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status);
} }
@@ -180,7 +186,7 @@ public class JobRecoveryOnStartupService {
modelTrainJobCoreService.markFailed( modelTrainJobCoreService.markFailed(
job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR"); job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR");
// model 상태 변경
markStepErrorByJobType(job, "inspect-error"); markStepErrorByJobType(job, "inspect-error");
} }
} }
@@ -206,6 +212,16 @@ public class JobRecoveryOnStartupService {
* *
* <p>예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러 * <p>예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러
*/ */
/**
 * Records a "stopped" state on the model that owns the given job, choosing the step from the
 * job's parameters.
 *
 * <p>When the job parameters contain {@code jobType == "EVAL"} the step-2 stop handler is
 * called; in every other case (including missing parameters) the step-1 stop handler is called.
 *
 * @param job job whose model should be marked as stopped
 * @param msg message forwarded to the stop handler
 */
private void markStepStopByJobType(ModelTrainJobDto job, String msg) {
  Map<String, Object> jobParams = job.getParamsJson();

  // Anything that is not explicitly an EVAL job is treated as step 1.
  if (jobParams == null || !"EVAL".equals(String.valueOf(jobParams.get("jobType")))) {
    modelTrainMngCoreService.markStep1Stop(job.getModelId(), msg);
    return;
  }

  modelTrainMngCoreService.markStep2Stop(job.getModelId(), msg);
}
private void markStepErrorByJobType(ModelTrainJobDto job, String msg) { private void markStepErrorByJobType(ModelTrainJobDto job, String msg) {
Map<String, Object> params = job.getParamsJson(); Map<String, Object> params = job.getParamsJson();
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType"))); boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));