리커버리 수정 #147
@@ -430,6 +430,44 @@ public class ModelTrainMngCoreService {
|
||||
master.setUpdatedDttm(ZonedDateTime.now());
|
||||
}
|
||||
|
||||
/**
|
||||
* step1 정지 처리
|
||||
*
|
||||
* @param modelId
|
||||
* @param errorMessage
|
||||
*/
|
||||
public void markStep1Stop(Long modelId, String errorMessage) {
|
||||
ModelMasterEntity master =
|
||||
modelMngRepository
|
||||
.findById(modelId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
|
||||
|
||||
master.setStatusCd(TrainStatusType.STOPPED.getId());
|
||||
master.setStep2State(TrainStatusType.STOPPED.getId());
|
||||
master.setLastError(errorMessage);
|
||||
master.setUpdatedUid(userUtil.getId());
|
||||
master.setUpdatedDttm(ZonedDateTime.now());
|
||||
}
|
||||
|
||||
/**
|
||||
* step2 정지 처리
|
||||
*
|
||||
* @param modelId
|
||||
* @param errorMessage
|
||||
*/
|
||||
public void markStep2Stop(Long modelId, String errorMessage) {
|
||||
ModelMasterEntity master =
|
||||
modelMngRepository
|
||||
.findById(modelId)
|
||||
.orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
|
||||
|
||||
master.setStatusCd(TrainStatusType.STOPPED.getId());
|
||||
master.setStep2State(TrainStatusType.STOPPED.getId());
|
||||
master.setLastError(errorMessage);
|
||||
master.setUpdatedUid(userUtil.getId());
|
||||
master.setUpdatedDttm(ZonedDateTime.now());
|
||||
}
|
||||
|
||||
@Transactional
|
||||
public void markSuccess(Long modelId) {
|
||||
ModelMasterEntity master =
|
||||
|
||||
@@ -93,6 +93,7 @@ public class JobRecoveryOnStartupService {
|
||||
if (out.completed()) {
|
||||
log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId());
|
||||
modelTrainJobCoreService.markSuccess(job.getId(), 0);
|
||||
// model 상태 변경
|
||||
markStepSuccessByJobType(job);
|
||||
|
||||
} else {
|
||||
@@ -104,7 +105,7 @@ public class JobRecoveryOnStartupService {
|
||||
|
||||
modelTrainJobCoreService.markFailed(
|
||||
job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE");
|
||||
|
||||
// model 상태 변경
|
||||
markStepErrorByJobType(job, out.reason());
|
||||
}
|
||||
continue;
|
||||
@@ -136,13 +137,17 @@ public class JobRecoveryOnStartupService {
|
||||
"[RECOVERY] container stopped (will be auto removed by --rm). container={}",
|
||||
containerName);
|
||||
|
||||
// 여기서 상태를 PAUSED로 바꿔도 되고
|
||||
// job 상태 변경
|
||||
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
||||
// model 상태 변경
|
||||
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
|
||||
|
||||
// job 상태 변경
|
||||
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
||||
// model 상태 변경
|
||||
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -156,6 +161,7 @@ public class JobRecoveryOnStartupService {
|
||||
if (exitCode != null && exitCode == 0) {
|
||||
log.info("[RECOVERY] container exited(0). mark SUCCESS. container={}", containerName);
|
||||
modelTrainJobCoreService.markSuccess(job.getId(), 0);
|
||||
// model 상태 변경
|
||||
markStepSuccessByJobType(job);
|
||||
|
||||
} else {
|
||||
@@ -168,7 +174,7 @@ public class JobRecoveryOnStartupService {
|
||||
|
||||
modelTrainJobCoreService.markFailed(
|
||||
job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO");
|
||||
|
||||
// model 상태 변경
|
||||
markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status);
|
||||
}
|
||||
|
||||
@@ -180,7 +186,7 @@ public class JobRecoveryOnStartupService {
|
||||
|
||||
modelTrainJobCoreService.markFailed(
|
||||
job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR");
|
||||
|
||||
// model 상태 변경
|
||||
markStepErrorByJobType(job, "inspect-error");
|
||||
}
|
||||
}
|
||||
@@ -206,6 +212,16 @@ public class JobRecoveryOnStartupService {
|
||||
*
|
||||
* <p>예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러
|
||||
*/
|
||||
private void markStepStopByJobType(ModelTrainJobDto job, String msg) {
|
||||
Map<String, Object> params = job.getParamsJson();
|
||||
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));
|
||||
if (isEval) {
|
||||
modelTrainMngCoreService.markStep2Stop(job.getModelId(), msg);
|
||||
} else {
|
||||
modelTrainMngCoreService.markStep1Stop(job.getModelId(), msg);
|
||||
}
|
||||
}
|
||||
|
||||
private void markStepErrorByJobType(ModelTrainJobDto job, String msg) {
|
||||
Map<String, Object> params = job.getParamsJson();
|
||||
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));
|
||||
|
||||
Reference in New Issue
Block a user