실행 오류 수정
This commit is contained in:
@@ -9,9 +9,11 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import lombok.extern.log4j.Log4j2;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Log4j2
|
||||
@Service
|
||||
public class DockerTrainService {
|
||||
|
||||
@@ -44,6 +46,11 @@ public class DockerTrainService {
|
||||
|
||||
List<String> cmd = buildDockerRunCommand(containerName, req);
|
||||
|
||||
log.info("=== Docker Train Command ===");
|
||||
log.info("Container: {}", containerName);
|
||||
log.info("Command: {}", String.join(" ", cmd));
|
||||
log.info("================================");
|
||||
|
||||
ProcessBuilder pb = new ProcessBuilder(cmd);
|
||||
pb.redirectErrorStream(true);
|
||||
|
||||
@@ -121,23 +128,11 @@ public class DockerTrainService {
|
||||
|
||||
// Set the container name
|
||||
c.add("--name");
|
||||
c.add(containerName);
|
||||
c.add(containerName + "-" + req.getOutputFolder().substring(0, 8));
|
||||
|
||||
// Automatically remove the container when it exits
|
||||
c.add("--rm");
|
||||
|
||||
// Set environment variables
|
||||
c.add("-e");
|
||||
c.add("OPENCV_LOG_LEVEL=ERROR");
|
||||
c.add("-e");
|
||||
c.add("NCCL_DEBUG=INFO");
|
||||
c.add("-e");
|
||||
c.add("NCCL_IB_DISABLE=1");
|
||||
c.add("-e");
|
||||
c.add("NCCL_P2P_DISABLE=0");
|
||||
c.add("-e");
|
||||
c.add("NCCL_SOCKET_IFNAME=eth0");
|
||||
|
||||
// Use all GPUs
|
||||
c.add("--gpus");
|
||||
c.add("all");
|
||||
@@ -156,6 +151,18 @@ public class DockerTrainService {
|
||||
c.add("--ulimit");
|
||||
c.add("stack=67108864");
|
||||
|
||||
// Set environment variables
|
||||
c.add("-e");
|
||||
c.add("OPENCV_LOG_LEVEL=ERROR");
|
||||
c.add("-e");
|
||||
c.add("NCCL_DEBUG=INFO");
|
||||
c.add("-e");
|
||||
c.add("NCCL_IB_DISABLE=1");
|
||||
c.add("-e");
|
||||
c.add("NCCL_P2P_DISABLE=0");
|
||||
c.add("-e");
|
||||
c.add("NCCL_SOCKET_IFNAME=eth0");
|
||||
|
||||
// Mount the request/result directories as volumes
|
||||
c.add("-v");
|
||||
c.add(requestDir + ":/data");
|
||||
|
||||
@@ -67,8 +67,6 @@ public class TrainJobService {
|
||||
|
||||
// Trigger worker execution after the commit (the listener must receive this event in the AFTER_COMMIT phase)
|
||||
eventPublisher.publishEvent(new ModelTrainJobQueuedEvent(jobId));
|
||||
|
||||
modelTrainMngCoreService.markStep1InProgress(modelId, jobId);
|
||||
return jobId;
|
||||
}
|
||||
|
||||
|
||||
@@ -55,6 +55,7 @@ public class TrainJobWorker {
|
||||
TrainRunResult result;
|
||||
|
||||
if (isEval) {
|
||||
modelTrainMngCoreService.markStep2InProgress(modelId, jobId);
|
||||
String uuid = String.valueOf(params.get("uuid"));
|
||||
int epoch = (int) params.get("epoch");
|
||||
|
||||
@@ -62,6 +63,7 @@ public class TrainJobWorker {
|
||||
result = dockerTrainService.runEvalSync(evalReq, containerName);
|
||||
|
||||
} else {
|
||||
modelTrainMngCoreService.markStep1InProgress(modelId, jobId);
|
||||
TrainRunRequest trainReq = toTrainRunRequest(params);
|
||||
result = dockerTrainService.runTrainSync(trainReq, containerName);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user