학습 실패여부 확인 기능 추가
This commit is contained in:
@@ -0,0 +1,76 @@
|
||||
package com.kamco.cd.training.train.service;
|
||||
|
||||
import com.kamco.cd.training.postgres.core.ModelTrainJobCoreService;
|
||||
import com.kamco.cd.training.postgres.core.ModelTrainMngCoreService;
|
||||
import com.kamco.cd.training.train.dto.ModelTrainJobDto;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.boot.context.event.ApplicationReadyEvent;
|
||||
import org.springframework.context.event.EventListener;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
/** 실행중 학습이 있을때 처리 */
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@Log4j2
|
||||
@Transactional(readOnly = true)
|
||||
public class JobRecoveryOnStartupService {
|
||||
private final ModelTrainJobCoreService modelTrainJobCoreService;
|
||||
private final ModelTrainMngCoreService modelTrainMngCoreService;
|
||||
|
||||
@EventListener(ApplicationReadyEvent.class)
|
||||
public void recover() {
|
||||
// RUNNING 중인 학습이 있는지 조회
|
||||
ModelTrainJobDto runningJobs = modelTrainJobCoreService.findRunningJobs();
|
||||
|
||||
if (runningJobs == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
String containerName = runningJobs.getContainerName();
|
||||
|
||||
try {
|
||||
boolean containerAlive = isContainerRunning(containerName);
|
||||
|
||||
if (containerAlive) {
|
||||
// 컨테이너 살아있으면 → RUNNING 유지
|
||||
log.info("[RECOVERY] container still running: {}", containerName);
|
||||
|
||||
} else {
|
||||
// 컨테이너 죽었으면 → FAILED 처리
|
||||
log.info("[RECOVERY] container not found. mark FAILED: {}", containerName);
|
||||
|
||||
modelTrainJobCoreService.markFailed(
|
||||
runningJobs.getId(), null, "SERVER_RESTART_CONTAINER_NOT_FOUND");
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("[RECOVERY] container check failed. mark FAILED: {}", containerName, e);
|
||||
|
||||
modelTrainJobCoreService.markFailed(
|
||||
runningJobs.getId(), null, "SERVER_RESTART_CONTAINER_CHECK_ERROR");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* docker 실행중인지 확인하기
|
||||
*
|
||||
* @param containerName container name
|
||||
* @return true, false
|
||||
* @throws IOException
|
||||
*/
|
||||
private boolean isContainerRunning(String containerName) throws IOException {
|
||||
|
||||
ProcessBuilder pb =
|
||||
new ProcessBuilder("docker", "inspect", "-f", "{{.State.Running}}", containerName);
|
||||
|
||||
Process p = pb.start();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(p.getInputStream()));
|
||||
|
||||
String line = br.readLine();
|
||||
return "true".equals(line);
|
||||
}
|
||||
}
|
||||
@@ -30,10 +30,12 @@ public class TrainJobWorker {
|
||||
private final DockerTrainService dockerTrainService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
@Async
|
||||
@Async("trainJobExecutor")
|
||||
@TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT)
|
||||
public void handle(ModelTrainJobQueuedEvent event) {
|
||||
|
||||
log.info("[JOB] thread={}, jobId={}", Thread.currentThread().getName(), event.getJobId());
|
||||
|
||||
Long jobId = event.getJobId();
|
||||
|
||||
ModelTrainJobDto job =
|
||||
@@ -89,7 +91,6 @@ public class TrainJobWorker {
|
||||
|
||||
// 도커 실행 후 로그 수집
|
||||
result = dockerTrainService.runEvalSync(containerName, evalReq);
|
||||
|
||||
} else {
|
||||
// step1 진행중 처리
|
||||
modelTrainMngCoreService.markStep1InProgress(modelId, jobId);
|
||||
|
||||
Reference in New Issue
Block a user