Unverified commit ca290dca authored by Ayush Chaurasia, committed by GitHub

Weights & Biases (W&B) Feature Addition (#1235)

* Add wandb metric logging and bounding box debugging
* Improve formatting, readability
* Remove multiple paths for init, improve formatting
* Add wandb params
* Remove typecasting in bbox coordinates and reformat
* Cleanup
* add wandb to requirements.txt
* minor updates to test.py
* general reorg
* reduce --log-imgs to 10
* clean wandb import
* reverse wandb import assert
* add except AssertionError to try import
* move wandb init to all global ranks
* replace print() with logger.info()
* move wandb.init() bug fix
* project PosixPath to basename bug fix

Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Parent c8c5ef36
requirements.txt
@@ -13,6 +13,9 @@ torch>=1.6.0
 torchvision>=0.7.0
 tqdm>=4.41.0

+# logging -------------------------------------
+# wandb
+
 # coco ----------------------------------------
 # pycocotools>=2.0
...
test.py
@@ -33,7 +33,9 @@ def test(data,
          save_dir=Path(''),  # for saving images
          save_txt=False,  # for auto-labelling
          save_conf=False,
-         plots=True):
+         plots=True,
+         log_imgs=0):  # number of logged images

     # Initialize/load model and set device
     training = model is not None
     if training:  # called by train.py
@@ -77,6 +79,13 @@ def test(data,
     iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
     niou = iouv.numel()

+    # Logging
+    log_imgs = min(log_imgs, 100)  # ceil
+    try:
+        import wandb  # Weights & Biases
+    except ImportError:
+        log_imgs = 0
+
     # Dataloader
     if not training:
         img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
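A note on the guarded import above: it makes W&B a soft dependency of test.py, so a missing package simply disables image logging instead of crashing evaluation. A minimal standalone sketch of the same pattern (the cap of 100 mirrors the min(log_imgs, 100) line; the value 10 is just an example):

    try:
        import wandb  # optional dependency: pip install wandb
    except ImportError:
        wandb = None

    log_imgs = 10                  # requested number of logged images
    log_imgs = min(log_imgs, 100)  # hard cap, matching the hunk above
    if wandb is None:
        log_imgs = 0               # no wandb -> feature off, test() still runs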
@@ -91,7 +100,7 @@ def test(data,
     s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
     p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
     loss = torch.zeros(3, device=device)
-    jdict, stats, ap, ap_class = [], [], [], []
+    jdict, stats, ap, ap_class, wandb_images = [], [], [], [], []
     for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
         img = img.to(device, non_blocking=True)
         img = img.half() if half else img.float()  # uint8 to fp16/32
@@ -139,6 +148,14 @@ def test(data,
                     with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f:
                         f.write(('%g ' * len(line) + '\n') % line)

+            # W&B logging
+            if len(wandb_images) < log_imgs:
+                bbox_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
+                              "class_id": int(cls),
+                              "scores": {"class_score": conf},
+                              "domain": "pixel"} for *xyxy, conf, cls in pred.clone().tolist()]
+                wandb_images.append(wandb.Image(img[si], boxes={"predictions": {"box_data": bbox_data}}))
+
             # Clip boxes to image bounds
             clip_coords(pred, (height, width))
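For readers unfamiliar with the W&B box format: each row of pred is (x1, y1, x2, y2, conf, cls), and the comprehension unpacks it into the box_data dictionaries that wandb.Image renders as interactive overlays. A self-contained sketch under made-up values (the image, coordinates, and project name are placeholders, not from the PR):

    import numpy as np
    import wandb

    wandb.init(project="bbox-demo", mode="offline")  # placeholder project; offline run, nothing uploaded

    # One fake detection per row: (x1, y1, x2, y2, confidence, class index)
    pred = [[50.0, 30.0, 200.0, 180.0, 0.91, 0]]

    bbox_data = [{"position": {"minX": x1, "minY": y1, "maxX": x2, "maxY": y2},
                  "class_id": int(cls),
                  "scores": {"class_score": conf},
                  "domain": "pixel"}  # pixel coordinates; omit for 0-1 fractions
                 for x1, y1, x2, y2, conf, cls in pred]

    image = np.zeros((256, 256, 3), dtype=np.uint8)  # placeholder image
    wandb.log({"example": wandb.Image(image, boxes={"predictions": {"box_data": bbox_data}})})
    wandb.finish()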
@@ -196,6 +213,10 @@ def test(data,
             f = save_dir / f'test_batch{batch_i}_pred.jpg'
             plot_images(img, output_to_target(output, width, height), paths, str(f), names)  # predictions

+    # W&B logging
+    if wandb_images:
+        wandb.log({"outputs": wandb_images})
+
     # Compute statistics
     stats = [np.concatenate(x, 0) for x in zip(*stats)]  # to numpy
     if len(stats) and stats[0].any():
...
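Design note: images are accumulated across batches and sent in a single wandb.log call after the loop, so they all land on one step and render as one media panel rather than many scattered entries. Sketched in isolation (placeholder images and project name, offline mode):

    import numpy as np
    import wandb

    wandb.init(project="media-demo", mode="offline")  # placeholder project name
    wandb_images = [wandb.Image(np.zeros((64, 64, 3), dtype=np.uint8), caption=f"img {i}")
                    for i in range(3)]
    if wandb_images:
        wandb.log({"outputs": wandb_images})  # one call -> one step, one panel
    wandb.finish()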
train.py
@@ -33,7 +33,7 @@ from utils.torch_utils import ModelEMA, select_device, intersect_dicts

 logger = logging.getLogger(__name__)

-def train(hyp, opt, device, tb_writer=None):
+def train(hyp, opt, device, tb_writer=None, wandb=None):
     logger.info(f'Hyperparameters {hyp}')
     log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
     wdir = log_dir / 'weights'  # weights directory
@@ -118,6 +118,11 @@ def train(hyp, opt, device, tb_writer=None):
     scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
     # plot_lr_scheduler(optimizer, scheduler, epochs)

+    # Logging
+    if wandb and wandb.run is None:
+        id = ckpt.get('wandb_id') if 'ckpt' in locals() else None
+        wandb_run = wandb.init(config=opt, resume="allow", project=os.path.basename(log_dir), id=id)
+
     # Resume
     start_epoch, best_fitness = 0, 0.0
     if pretrained:
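The id= handed to wandb.init is what makes checkpoint resume seamless: with resume="allow", wandb reattaches to that run id when it exists and starts a fresh run when id is None. (Minor nit in the hunk: the local id shadows the Python builtin, harmless here.) A hedged sketch of the round trip, with a hypothetical checkpoint dict:

    import wandb

    ckpt = {}  # hypothetical: torch.load('last.pt') would supply this when resuming

    run = wandb.init(project="resume-demo",    # placeholder project name
                     id=ckpt.get("wandb_id"),  # None -> wandb generates a new id
                     resume="allow",           # reattach if the id exists
                     mode="offline")
    print(run.id)  # save this into the next checkpoint to enable future resumes
    wandb.finish()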
@@ -317,7 +322,8 @@ def train(hyp, opt, device, tb_writer=None):
                                                  single_cls=opt.single_cls,
                                                  dataloader=testloader,
                                                  save_dir=log_dir,
-                                                 plots=epoch == 0 or final_epoch)  # plot first and last
+                                                 plots=epoch == 0 or final_epoch,  # plot first and last
+                                                 log_imgs=opt.log_imgs)

             # Write
             with open(results_file, 'a') as f:
@@ -325,14 +331,16 @@
             if len(opt.name) and opt.bucket:
                 os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

-            # Tensorboard
-            if tb_writer:
-                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
-                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
-                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
-                        'x/lr0', 'x/lr1', 'x/lr2']  # params
-                for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
-                    tb_writer.add_scalar(tag, x, epoch)
+            # Log
+            tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
+                    'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
+                    'val/giou_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
+                    'x/lr0', 'x/lr1', 'x/lr2']  # params
+            for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
+                if tb_writer:
+                    tb_writer.add_scalar(tag, x, epoch)  # tensorboard
+                if wandb:
+                    wandb.log({tag: x})  # W&B

             # Update best mAP
             fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
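The restructuring here is worth spelling out: instead of a TensorBoard-only block, each scalar now flows through one shared loop and is mirrored to every active backend. One caveat, stated as a general W&B default rather than something this PR addresses: each wandb.log call advances the run's global step, so logging the 13 tags as 13 separate calls yields 13 steps per epoch, while batching them into one dict keeps one step per epoch. A sketch with a hypothetical helper (log_scalars is not in the PR):

    def log_scalars(tags, values, epoch, tb_writer=None, wandb=None):
        """Mirror one list of scalars to whichever logging backends are active."""
        for tag, x in zip(tags, values):
            if tb_writer:
                tb_writer.add_scalar(tag, x, epoch)  # TensorBoard
        if wandb:
            wandb.log(dict(zip(tags, values)))       # W&B: one call, one step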
@@ -347,7 +355,8 @@ def train(hyp, opt, device, tb_writer=None):
                         'best_fitness': best_fitness,
                         'training_results': f.read(),
                         'model': ema.ema,
-                        'optimizer': None if final_epoch else optimizer.state_dict()}
+                        'optimizer': None if final_epoch else optimizer.state_dict(),
+                        'wandb_id': wandb_run.id if wandb else None}

                 # Save last, best and delete
                 torch.save(ckpt, last)
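Persisting wandb_id inside the .pt file is the other half of the resume mechanism: the id travels with the weights, so the wandb.init above can read it back out of ckpt. A tiny hypothetical round trip (filename and id string are placeholders):

    import torch

    ckpt = {"epoch": 0, "wandb_id": "3p6kj2xa"}       # placeholder run id
    torch.save(ckpt, "last.pt")                       # id rides along with the weights
    wandb_id = torch.load("last.pt").get("wandb_id")  # feed this to wandb.init(id=...)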
@@ -403,7 +412,9 @@ if __name__ == '__main__':
     parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
     parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
     parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
+    parser.add_argument('--log-imgs', type=int, default=10, help='number of images for W&B logging, max 100')
     parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
     opt = parser.parse_args()

     # Set DDP variables
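Usage-wise, the new flag defaults to 10 and is clamped to 100 inside test.py (log_imgs = min(log_imgs, 100)); argparse converts the hyphen, so it surfaces as opt.log_imgs. A quick check of that behavior:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--log-imgs', type=int, default=10, help='number of images for W&B logging, max 100')
    opt = parser.parse_args(['--log-imgs', '16'])  # e.g. python train.py --log-imgs 16
    print(opt.log_imgs)                            # -> 16 ('-' becomes '_' in the dest)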
@@ -452,12 +463,23 @@ if __name__ == '__main__':
     # Train
     logger.info(opt)
     if not opt.evolve:
-        tb_writer = None
+        tb_writer, wandb = None, None  # init loggers
         if opt.global_rank in [-1, 0]:
+            # Tensorboard
             logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
             tb_writer = SummaryWriter(log_dir=log_dir)  # runs/exp0
-        train(hyp, opt, device, tb_writer)
+
+            # W&B
+            try:
+                import wandb
+                assert os.environ.get('WANDB_DISABLED') != 'true'
+                logger.info("Weights & Biases logging enabled, to disable set os.environ['WANDB_DISABLED'] = 'true'")
+            except (ImportError, AssertionError):
+                opt.log_imgs = 0
+                logger.info("Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)")
+
+        train(hyp, opt, device, tb_writer, wandb)

     # Evolve hyperparameters (optional)
     else:
...
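Finally, the assert-inside-try idiom at the bottom deserves a comment: it routes two unrelated conditions, a missing package (ImportError) and the user opt-out WANDB_DISABLED=true (AssertionError), into a single disabled path, leaving wandb as None for train() to check. Reduced to its core:

    import os

    try:
        import wandb
        assert os.environ.get('WANDB_DISABLED') != 'true'  # env-var opt-out
    except (ImportError, AssertionError):
        wandb = None  # downstream code gates every call with "if wandb:"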