Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Y
yolov5
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
Administrator
yolov5
Commits
518c0957
Unverified
提交
518c0957
authored
3月 28, 2021
作者:
Ayush Chaurasia
提交者:
GitHub
3月 28, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
W&B resume ddp from run link fix (#2579)
* W&B resume ddp from run link fix * Native DDP W&B support for training, resuming
上级
dc51e80b
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
53 行增加
和
15 行删除
+53
-15
train.py
train.py
+2
-2
wandb_utils.py
utils/wandb_logging/wandb_utils.py
+51
-13
没有找到文件。
train.py
浏览文件 @
518c0957
...
...
@@ -33,7 +33,7 @@ from utils.google_utils import attempt_download
from
utils.loss
import
ComputeLoss
from
utils.plots
import
plot_images
,
plot_labels
,
plot_results
,
plot_evolution
from
utils.torch_utils
import
ModelEMA
,
select_device
,
intersect_dicts
,
torch_distributed_zero_first
,
is_parallel
from
utils.wandb_logging.wandb_utils
import
WandbLogger
,
resume_and_get_id
from
utils.wandb_logging.wandb_utils
import
WandbLogger
,
check_wandb_resume
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -496,7 +496,7 @@ if __name__ == '__main__':
check_requirements
()
# Resume
wandb_run
=
resume_and_get_id
(
opt
)
wandb_run
=
check_wandb_resume
(
opt
)
if
opt
.
resume
and
not
wandb_run
:
# resume an interrupted run
ckpt
=
opt
.
resume
if
isinstance
(
opt
.
resume
,
str
)
else
get_latest_run
()
# specified or most recent path
assert
os
.
path
.
isfile
(
ckpt
),
'ERROR: --resume checkpoint does not exist'
...
...
utils/wandb_logging/wandb_utils.py
浏览文件 @
518c0957
...
...
@@ -23,7 +23,7 @@ except ImportError:
WANDB_ARTIFACT_PREFIX
=
'wandb-artifact://'
def
remove_prefix
(
from_string
,
prefix
):
def remove_prefix(from_string, prefix=WANDB_ARTIFACT_PREFIX):
    """Return ``from_string`` with a leading ``prefix`` removed.

    Args:
        from_string: string to strip.
        prefix: leading text to drop; defaults to WANDB_ARTIFACT_PREFIX.

    Returns:
        The text following ``prefix`` when ``from_string`` starts with it,
        otherwise ``from_string`` unchanged.
    """
    # Only slice when the prefix is really present: slicing unconditionally
    # would chop len(prefix) characters off an unrelated string.
    if from_string.startswith(prefix):
        return from_string[len(prefix):]
    return from_string
...
...
@@ -33,35 +33,73 @@ def check_wandb_config_file(data_config_file):
return
wandb_config
return
data_config_file
def get_run_info(run_path):
    """Split a W&B run link into run id, project name and model artifact name.

    Args:
        run_path: run link string; WANDB_ARTIFACT_PREFIX is stripped from its
            front via ``remove_prefix`` before the remainder is parsed as a path.

    Returns:
        Tuple ``(run_id, project, model_artifact_name)`` where ``run_id`` is the
        stem of the last path component, ``project`` is the stem of its parent
        and ``model_artifact_name`` is ``'run_' + run_id + '_model'``.
    """
    location = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
    run_id, project = location.stem, location.parent.stem
    return run_id, project, 'run_' + run_id + '_model'
def
resume_and_get_id
(
opt
):
# It's more elegant to stick to 1 wandb.init call, but as useful config data is overwritten in the WandbLogger's wandb.init call
def
check_wandb_resume
(
opt
):
process_wandb_config_ddp_mode
(
opt
)
if
opt
.
global_rank
not
in
[
-
1
,
0
]
else
None
if
isinstance
(
opt
.
resume
,
str
):
if
opt
.
resume
.
startswith
(
WANDB_ARTIFACT_PREFIX
):
run_path
=
Path
(
remove_prefix
(
opt
.
resume
,
WANDB_ARTIFACT_PREFIX
))
run_id
=
run_path
.
stem
project
=
run_path
.
parent
.
stem
model_artifact_name
=
WANDB_ARTIFACT_PREFIX
+
'run_'
+
run_id
+
'_model'
assert
wandb
,
'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
run
=
wandb
.
init
(
id
=
run_id
,
project
=
project
,
resume
=
'allow'
)
opt
.
resume
=
model_artifact_name
return
run
if
opt
.
global_rank
not
in
[
-
1
,
0
]:
# For resuming DDP runs
run_id
,
project
,
model_artifact_name
=
get_run_info
(
opt
.
resume
)
api
=
wandb
.
Api
()
artifact
=
api
.
artifact
(
project
+
'/'
+
model_artifact_name
+
':latest'
)
modeldir
=
artifact
.
download
()
opt
.
weights
=
str
(
Path
(
modeldir
)
/
"last.pt"
)
return
True
return
None
def process_wandb_config_ddp_mode(opt):
    """Download W&B dataset artifacts named in the data yaml and repoint ``opt.data``.

    Loads the yaml file at ``opt.data``. For each of the 'train' and 'val'
    entries that is a string starting with WANDB_ARTIFACT_PREFIX, the artifact
    ``<entry without prefix>:<opt.artifact_alias>`` is fetched through
    ``wandb.Api()``, downloaded, and the entry is rewritten to the
    ``data/images/`` folder inside the download directory.

    When at least one entry was rewritten, the updated dict is dumped to
    ``wandb_local_data.yaml`` inside a download directory (the 'val' one when
    available, otherwise the 'train' one) and ``opt.data`` is set to that file.
    When neither entry is an artifact link, ``opt`` is left untouched.

    Args:
        opt: options object; ``opt.data`` and ``opt.artifact_alias`` are read,
            ``opt.data`` may be reassigned.

    Raises:
        KeyError: if the yaml has no 'train' or 'val' entry.
    """
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.SafeLoader)  # data dict

    api = None  # created lazily so no W&B client is built for purely local datasets
    download_dirs = {}
    for split in ('train', 'val'):
        source = data_dict[split]
        # Entries that are not strings (e.g. a list of paths) cannot be artifact links.
        if not (isinstance(source, str) and source.startswith(WANDB_ARTIFACT_PREFIX)):
            continue
        if api is None:
            api = wandb.Api()
        artifact = api.artifact(remove_prefix(source, WANDB_ARTIFACT_PREFIX) + ':' + opt.artifact_alias)
        download_dirs[split] = artifact.download()
        data_dict[split] = str(Path(download_dirs[split]) / 'data/images/')

    if download_dirs:
        # Fall back to the train directory when only the train entry is an
        # artifact; the val directory may not exist in that case.
        target_dir = download_dirs.get('val') or download_dirs['train']
        ddp_data_path = str(Path(target_dir) / 'wandb_local_data.yaml')
        with open(ddp_data_path, 'w') as f:
            yaml.dump(data_dict, f)
        opt.data = ddp_data_path
class
WandbLogger
():
def
__init__
(
self
,
opt
,
name
,
run_id
,
data_dict
,
job_type
=
'Training'
):
# Pre-training routine --
self
.
job_type
=
job_type
self
.
wandb
,
self
.
wandb_run
,
self
.
data_dict
=
wandb
,
None
if
not
wandb
else
wandb
.
run
,
data_dict
if
self
.
wandb
:
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
if
isinstance
(
opt
.
resume
,
str
):
# checks resume from artifact
if
opt
.
resume
.
startswith
(
WANDB_ARTIFACT_PREFIX
):
run_id
,
project
,
model_artifact_name
=
get_run_info
(
opt
.
resume
)
model_artifact_name
=
WANDB_ARTIFACT_PREFIX
+
model_artifact_name
assert
wandb
,
'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
self
.
wandb_run
=
wandb
.
init
(
id
=
run_id
,
project
=
project
,
resume
=
'allow'
)
opt
.
resume
=
model_artifact_name
elif
self
.
wandb
:
self
.
wandb_run
=
wandb
.
init
(
config
=
opt
,
resume
=
"allow"
,
project
=
'YOLOv5'
if
opt
.
project
==
'runs/train'
else
Path
(
opt
.
project
)
.
stem
,
name
=
name
,
job_type
=
job_type
,
id
=
run_id
)
if
not
wandb
.
run
else
wandb
.
run
if
self
.
wandb_run
:
if
self
.
job_type
==
'Training'
:
if
not
opt
.
resume
:
wandb_data_dict
=
self
.
check_and_upload_dataset
(
opt
)
if
opt
.
upload_dataset
else
data_dict
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论