10 Commits

4 changed files with 24 additions and 6 deletions

View File

@@ -181,15 +181,15 @@ public class ModelHyperParamEntity {
private String metrics = "mFscore,mIoU";
/** Default: changed_fscore */
@Size(max = 30)
@Size(max = 100)
@NotNull
@Column(name = "save_best", nullable = false, length = 30)
@Column(name = "save_best", nullable = false, length = 100)
private String saveBest = "changed_fscore";
/** Default: greater */
@Size(max = 10)
@Size(max = 100)
@NotNull
@Column(name = "save_best_rule", nullable = false, length = 10)
@Column(name = "save_best_rule", nullable = false, length = 100)
private String saveBestRule = "greater";
/** Default: 1 */

View File

@@ -56,6 +56,13 @@ public class DockerTrainService {
@Value("${spring.profiles.active}")
private String profile;
@Value("${hyper.parameter.gpus}")
private String hyperGpus;
@Value("${hyper.parameter.gpu-ids}")
private String hyperGpuIds;
private final ModelTrainJobCoreService modelTrainJobCoreService;
/**
@@ -285,11 +292,13 @@ public class DockerTrainService {
// addArg(c, "--gpu-ids", req.getGpuIds()); // null
if ("prod".equals(profile)) {
addArg(c, "--batch-size", 2); // 학습서버 GPU 1개인 곳은 batch-size:2 까지만 가능
addArg(c, "--gpus", "1"); // 학습서버 GPU 1개인 곳은 1이어야 함
addArg(c, "--gpu-ids", "0"); // 학습서버 GPU 1개인 곳은 0이어야 함
} else {
addArg(c, "--batch-size", req.getBatchSize()); // non-prod profile: use the batch size supplied in the request
}
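addArg(c, "--gpus", hyperGpus); // taken from the hyper.parameter.gpus property; a training server with a single GPU needs 1
addArg(c, "--gpu-ids", hyperGpuIds); // taken from the hyper.parameter.gpu-ids property; a training server with a single GPU needs 0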
addArg(c, "--lr", req.getLearningRate());
addArg(c, "--backbone", req.getBackbone());
addArg(c, "--epochs", req.getEpochs());

View File

@@ -41,3 +41,7 @@ train:
container_prefix: kamco-cd-train
shm_size: 16g
ipc_host: true
hyper:
parameter:
gpus: 4
gpu-ids: 0,1,2,3

View File

@@ -78,3 +78,8 @@ management:
exposure:
include:
- "health"
hyper:
parameter:
gpus: 1
gpu-ids: 0