Unverified commit b3678601, authored by Glenn Jocher, committed by GitHub

New `HUBDatasetStats()` class (#8716)

* New `HUBDatasetStats()` class. Usage examples:

  ```python
  from utils.dataloaders import *

  stats = HUBDatasetStats('coco128.yaml', autodownload=True)  # method 1
  stats = HUBDatasetStats('path/to/coco128_with_yaml.zip')  # method 2
  stats.get_json(save=False)
  stats.process_images()
  ```

  @kalenmike

* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Update dataloaders.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Parent a6f197ae
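Before the diff, a quick sketch of the new workflow this commit introduces: the old one-shot `dataset_stats(..., hub=True)` call becomes a class whose constructor parses `data.yaml`, with statistics and image compression split into two methods. A minimal, illustrative sketch following the coco128 examples from the commit message above, not canonical API docs:

```python
from utils.dataloaders import HUBDatasetStats

# Method 1: start from a data.yaml (autodownload fetches the dataset if missing)
stats = HUBDatasetStats('coco128.yaml', autodownload=True)

# Method 2: start from a zip that unpacks to a folder containing a data.yaml
# stats = HUBDatasetStats('path/to/coco128_with_yaml.zip')

s = stats.get_json(save=True)  # build the stats dict and write <dataset>-hub/stats.json
stats.process_images()         # resize/compress every image into <dataset>-hub/images/
```

Splitting `get_json()` from `process_images()` lets HUB request the lightweight JSON without also paying for image compression, which the old `hub=True` flag coupled together.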
```diff
--- a/utils/dataloaders.py
+++ b/utils/dataloaders.py
@@ -977,21 +977,35 @@ def verify_image_label(args):
         return [None, None, None, None, nm, nf, ne, nc, msg]
 
 
-def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
+class HUBDatasetStats():
     """ Return dataset statistics dictionary with images and instances counts per split per class
     To run in parent directory: export PYTHONPATH="$PWD/yolov5"
-    Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True)
-    Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip')
+    Usage1: from utils.dataloaders import *; HUBDatasetStats('coco128.yaml', autodownload=True)
+    Usage2: from utils.dataloaders import *; HUBDatasetStats('path/to/coco128_with_yaml.zip')
     Arguments
         path:           Path to data.yaml or data.zip (with data.yaml inside data.zip)
         autodownload:   Attempt to download dataset if not found locally
-        verbose:        Print stats dictionary
     """
 
-    def _round_labels(labels):
-        # Update labels to integer class and 6 decimal place floats
-        return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
+    def __init__(self, path='coco128.yaml', autodownload=False):
+        # Initialize class
+        zipped, data_dir, yaml_path = self._unzip(Path(path))
+        try:
+            with open(check_yaml(yaml_path), errors='ignore') as f:
+                data = yaml.safe_load(f)  # data dict
+                if zipped:
+                    data['path'] = data_dir
+        except Exception as e:
+            raise Exception("error/HUB/dataset_stats/yaml_load") from e
+
+        check_dataset(data, autodownload)  # download dataset if missing
+        self.hub_dir = Path(data['path'] + '-hub')
+        self.im_dir = self.hub_dir / 'images'
+        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes /images
+        self.stats = {'nc': data['nc'], 'names': data['names']}  # statistics dictionary
+        self.data = data
 
+    @staticmethod
     def _find_yaml(dir):
         # Return data.yaml file
         files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml'))  # try root level first and then recursive
@@ -1002,7 +1016,7 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
         assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
         return files[0]
 
-    def _unzip(path):
+    def _unzip(self, path):
         # Unzip data.zip
         if not str(path).endswith('.zip'):  # path is data.yaml
             return False, None, path
@@ -1010,11 +1024,11 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
         ZipFile(path).extractall(path=path.parent)  # unzip
         dir = path.with_suffix('')  # dataset directory == zip name
         assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
-        return True, str(dir), _find_yaml(dir)  # zipped, data_dir, yaml_path
+        return True, str(dir), self._find_yaml(dir)  # zipped, data_dir, yaml_path
 
-    def _hub_ops(f, max_dim=1920):
+    def _hub_ops(self, f, max_dim=1920):
         # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
-        f_new = im_dir / Path(f).name  # dataset-hub image filename
+        f_new = self.im_dir / Path(f).name  # dataset-hub image filename
         try:  # use PIL
             im = Image.open(f)
             r = max_dim / max(im.height, im.width)  # ratio
@@ -1030,69 +1044,49 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
             im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
             cv2.imwrite(str(f_new), im)
 
-    zipped, data_dir, yaml_path = _unzip(Path(path))
-    try:
-        with open(check_yaml(yaml_path), errors='ignore') as f:
-            data = yaml.safe_load(f)  # data dict
-            if zipped:
-                data['path'] = data_dir  # TODO: should this be dir.resolve()?`
-    except Exception:
-        raise Exception("error/HUB/dataset_stats/yaml_load")
-
-    check_dataset(data, autodownload)  # download dataset if missing
-    hub_dir = Path(data['path'] + ('-hub' if hub else ''))
-    stats = {'nc': data['nc'], 'names': data['names']}  # statistics dictionary
-    for split in 'train', 'val', 'test':
-        if data.get(split) is None:
-            stats[split] = None  # i.e. no test set
-            continue
-        x = []
-        dataset = LoadImagesAndLabels(data[split])  # load dataset
-        for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics'):
-            x.append(np.bincount(label[:, 0].astype(int), minlength=data['nc']))
-        x = np.array(x)  # shape(128x80)
-        stats[split] = {
-            'instance_stats': {
-                'total': int(x.sum()),
-                'per_class': x.sum(0).tolist()},
-            'image_stats': {
-                'total': dataset.n,
-                'unlabelled': int(np.all(x == 0, 1).sum()),
-                'per_class': (x > 0).sum(0).tolist()},
-            'labels': [{
-                str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
-
-        if hub:
-            im_dir = hub_dir / 'images'
-            im_dir.mkdir(parents=True, exist_ok=True)
-            for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
-                pass
-
-    # Profile
-    stats_path = hub_dir / 'stats.json'
-    if profile:
-        for _ in range(1):
-            file = stats_path.with_suffix('.npy')
-            t1 = time.time()
-            np.save(file, stats)
-            t2 = time.time()
-            x = np.load(file, allow_pickle=True)
-            print(f'stats.npy times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
-
-            file = stats_path.with_suffix('.json')
-            t1 = time.time()
-            with open(file, 'w') as f:
-                json.dump(stats, f)  # save stats *.json
-            t2 = time.time()
-            with open(file) as f:
-                x = json.load(f)  # load hyps dict
-            print(f'stats.json times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
-
-    # Save, print and return
-    if hub:
-        print(f'Saving {stats_path.resolve()}...')
-        with open(stats_path, 'w') as f:
-            json.dump(stats, f)  # save stats.json
-    if verbose:
-        print(json.dumps(stats, indent=2, sort_keys=False))
-    return stats
+    def get_json(self, save=False, verbose=False):
+        # Return dataset JSON for Ultralytics HUB
+        def _round(labels):
+            # Update labels to integer class and 6 decimal place floats
+            return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
+
+        for split in 'train', 'val', 'test':
+            if self.data.get(split) is None:
+                self.stats[split] = None  # i.e. no test set
+                continue
+            dataset = LoadImagesAndLabels(self.data[split])  # load dataset
+            x = np.array([
+                np.bincount(label[:, 0].astype(int), minlength=self.data['nc'])
+                for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics')])  # shape(128x80)
+            self.stats[split] = {
+                'instance_stats': {
+                    'total': int(x.sum()),
+                    'per_class': x.sum(0).tolist()},
+                'image_stats': {
+                    'total': dataset.n,
+                    'unlabelled': int(np.all(x == 0, 1).sum()),
+                    'per_class': (x > 0).sum(0).tolist()},
+                'labels': [{
+                    str(Path(k).name): _round(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
+
+        # Save, print and return
+        if save:
+            stats_path = self.hub_dir / 'stats.json'
+            print(f'Saving {stats_path.resolve()}...')
+            with open(stats_path, 'w') as f:
+                json.dump(self.stats, f)  # save stats.json
+        if verbose:
+            print(json.dumps(self.stats, indent=2, sort_keys=False))
+        return self.stats
+
+    def process_images(self):
+        # Compress images for Ultralytics HUB
+        for split in 'train', 'val', 'test':
+            if self.data.get(split) is None:
+                continue
+            dataset = LoadImagesAndLabels(self.data[split])  # load dataset
+            desc = f'{split} images'
+            for _ in tqdm(ThreadPool(NUM_THREADS).imap(self._hub_ops, dataset.im_files), total=dataset.n, desc=desc):
+                pass
+        print(f'Done. All images saved to {self.im_dir}')
+        return self.im_dir
```
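For reviewers of the JSON contract: the shape of the dictionary that `get_json()` returns (and writes to `stats.json` when `save=True`), read off the diff above. All numeric values below are made-up placeholders for illustration, not real coco128 output:

```python
# Illustrative structure only; counts and coordinates are placeholders
{
    'nc': 80,                             # class count, from data.yaml
    'names': ['person', 'bicycle', ...],  # class names, from data.yaml
    'train': {
        'instance_stats': {'total': 900, 'per_class': [250, 6, ...]},       # label rows per class
        'image_stats': {'total': 128, 'unlabelled': 0, 'per_class': [60, 3, ...]},
        # one single-key dict per image: {filename: [[class, x, y, w, h], ...]}, floats rounded by _round()
        'labels': [{'000000000009.jpg': [[45, 0.4794, 0.6888, 0.9556, 0.5955], ...]}, ...],
    },
    'val': {...},   # same structure as 'train'
    'test': None,   # a split missing from data.yaml maps to None
}
```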