학습 실패여부 확인 기능 추가

This commit is contained in:
2026-02-24 15:10:48 +09:00
parent 159fb281d4
commit 7c5f07683e
9 changed files with 197 additions and 12 deletions

View File

@@ -0,0 +1,76 @@
package com.kamco.cd.training.train.service;
import com.kamco.cd.training.postgres.core.ModelTrainJobCoreService;
import com.kamco.cd.training.postgres.core.ModelTrainMngCoreService;
import com.kamco.cd.training.train.dto.ModelTrainJobDto;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
import lombok.RequiredArgsConstructor;
import lombok.extern.log4j.Log4j2;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
/**
 * Reconciles a training job that is still recorded as RUNNING when the application becomes ready.
 *
 * <p>On {@link ApplicationReadyEvent} the running job is looked up and its Docker container is
 * inspected. If the container is still running the job is left untouched; if the container is not
 * running, or the check itself fails, the job is marked FAILED.
 */
@Component
@RequiredArgsConstructor
@Log4j2
@Transactional(readOnly = true)
public class JobRecoveryOnStartupService {

  /** Upper bound, in seconds, for a single {@code docker inspect} invocation. */
  private static final long DOCKER_INSPECT_TIMEOUT_SECONDS = 30L;

  private final ModelTrainJobCoreService modelTrainJobCoreService;
  // Not used in this class; kept so the Lombok-generated constructor signature is unchanged.
  private final ModelTrainMngCoreService modelTrainMngCoreService;

  /**
   * Checks the job currently recorded as RUNNING and marks it FAILED when its container is gone.
   *
   * <p>The method-level {@code @Transactional} overrides the class-level read-only default:
   * {@code markFailed} performs a write, and a write that joins a read-only transaction is either
   * rejected by the database or never flushed.
   */
  @Transactional
  @EventListener(ApplicationReadyEvent.class)
  public void recover() {
    // Look up the job that is currently recorded as RUNNING, if any.
    ModelTrainJobDto runningJobs = modelTrainJobCoreService.findRunningJobs();
    if (runningJobs == null) {
      return;
    }

    String containerName = runningJobs.getContainerName();
    try {
      boolean containerAlive = isContainerRunning(containerName);
      if (containerAlive) {
        // Container is still alive -> keep the job RUNNING.
        log.info("[RECOVERY] container still running: {}", containerName);
      } else {
        // Container is gone -> mark the job FAILED.
        log.info("[RECOVERY] container not found. mark FAILED: {}", containerName);
        modelTrainJobCoreService.markFailed(
            runningJobs.getId(), null, "SERVER_RESTART_CONTAINER_NOT_FOUND");
      }
    } catch (IOException e) {
      // The check could not be completed (docker not startable, timeout, interruption).
      log.error("[RECOVERY] container check failed. mark FAILED: {}", containerName, e);
      modelTrainJobCoreService.markFailed(
          runningJobs.getId(), null, "SERVER_RESTART_CONTAINER_CHECK_ERROR");
    }
  }

  /**
   * Checks whether the given Docker container is currently running.
   *
   * <p>Runs {@code docker inspect -f {{.State.Running}} <containerName>} and reports {@code true}
   * only when the first line of standard output is {@code true}.
   *
   * @param containerName container name; {@code null} or blank is treated as "not running"
   * @return {@code true} if docker reports the container as running, {@code false} otherwise
   * @throws IOException if the process cannot be started, does not finish within
   *     {@link #DOCKER_INSPECT_TIMEOUT_SECONDS} seconds, or the wait is interrupted
   */
  private boolean isContainerRunning(String containerName) throws IOException {
    // A null argument would make ProcessBuilder.start() throw NullPointerException, which would
    // escape the startup listener; a job without a container name cannot have a live container.
    if (containerName == null || containerName.trim().isEmpty()) {
      return false;
    }

    ProcessBuilder pb =
        new ProcessBuilder("docker", "inspect", "-f", "{{.State.Running}}", containerName);
    Process p = pb.start();
    try {
      // Wait first, with a bound, so a hung docker CLI cannot block startup indefinitely.
      // NOTE(review): this assumes the command's output is small enough to fit in the pipe
      // buffer, so waiting before reading does not stall the child - confirm.
      if (!p.waitFor(DOCKER_INSPECT_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
        throw new IOException(
            "docker inspect timed out after "
                + DOCKER_INSPECT_TIMEOUT_SECONDS
                + "s: "
                + containerName);
      }
      try (BufferedReader br =
          new BufferedReader(new InputStreamReader(p.getInputStream(), StandardCharsets.UTF_8))) {
        String line = br.readLine();
        // No stdout line (e.g. unknown container) means "not running".
        return line != null && "true".equals(line.trim());
      }
    } catch (InterruptedException e) {
      // Restore the interrupt flag and surface the failure through the declared exception type.
      Thread.currentThread().interrupt();
      throw new IOException("interrupted while waiting for docker inspect: " + containerName, e);
    } finally {
      // Kills the child if it is still alive (timeout/interrupt); no-op once it has exited.
      p.destroyForcibly();
    }
  }
}

View File

@@ -30,10 +30,12 @@ public class TrainJobWorker {
private final DockerTrainService dockerTrainService;
private final ObjectMapper objectMapper;
@Async
@Async("trainJobExecutor")
@TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT)
public void handle(ModelTrainJobQueuedEvent event) {
log.info("[JOB] thread={}, jobId={}", Thread.currentThread().getName(), event.getJobId());
Long jobId = event.getJobId();
ModelTrainJobDto job =
@@ -89,7 +91,6 @@ public class TrainJobWorker {
// 도커 실행 후 로그 수집
result = dockerTrainService.runEvalSync(containerName, evalReq);
} else {
// step1 진행중 처리
modelTrainMngCoreService.markStep1InProgress(modelId, jobId);