Parse URL authentication (#3424)

* Parse URL authentication
* urllib.parse.unquote()
* improved error handling
* improved error handling
* remove %3F
* update check_file()

Parse URL authentication (#3424)
f8651c38 · Glenn Jocher · GitHub · 3cb9ad4f · f8651c38 · f8651c38
--- a/utils/general.py
+++ b/utils/general.py
@@ -9,6 +9,7 @@ import random
 import re
 import subprocess
 import time
+import urllib
 from itertools import repeat
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
@@ -183,7 +184,8 @@ def check_file(file):
    if Path(file).is_file() or file == '':  # exists
        return file
    elif file.startswith(('http://', 'https://')):  # download
-        url, file = file, Path(file).name
+        url, file = file, Path(urllib.parse.unquote(str(file))).name  # url, file (decode '%2F' to '/' etc.)
+        file = file.split('?')[0]  # parse authentication https://url.com/file.txt?auth...
        print(f'Downloading {url} to {file}...')
        torch.hub.download_url_to_file(url, file)
        assert Path(file).exists() and Path(file).stat().st_size > 0, f'File download failed: {url}'  # check

--- a/utils/google_utils.py
+++ b/utils/google_utils.py
@@ -4,6 +4,7 @@ import os
 import platform
 import subprocess
 import time
+import urllib
 from pathlib import Path

 import requests
@@ -19,30 +20,32 @@ def gsutil_getsize(url=''):
 def safe_download(file, url, url2=None, min_bytes=1E0, error_msg=''):
    # Attempts to download file from url or url2, checks and removes incomplete downloads < min_bytes
    file = Path(file)
-    try:  # GitHub
+    assert_msg = f"Downloaded file '{file}' does not exist or size is < min_bytes={min_bytes}"
+    try:  # url1
        print(f'Downloading {url} to {file}...')
        torch.hub.download_url_to_file(url, str(file))
-        assert file.exists() and file.stat().st_size > min_bytes  # check
-    except Exception as e:  # GCP
+        assert file.exists() and file.stat().st_size > min_bytes, assert_msg  # check
+    except Exception as e:  # url2
        file.unlink(missing_ok=True)  # remove partial downloads
-        print(f'Download error: {e}\nRe-attempting {url2 or url} to {file}...')
+        print(f'ERROR: {e}\nRe-attempting {url2 or url} to {file}...')
        os.system(f"curl -L '{url2 or url}' -o '{file}' --retry 3 -C -")  # curl download, retry and resume on fail
    finally:
        if not file.exists() or file.stat().st_size < min_bytes:  # check
            file.unlink(missing_ok=True)  # remove partial downloads
-            print(f'ERROR: Download failure: {error_msg or url}')
+            print(f"ERROR: {assert_msg}\n{error_msg}")
        print('')


-def attempt_download(file, repo='ultralytics/yolov5'):
+def attempt_download(file, repo='ultralytics/yolov5'):  # from utils.google_utils import *; attempt_download()
    # Attempt file download if does not exist
    file = Path(str(file).strip().replace("'", ''))

    if not file.exists():
        # URL specified
-        name = file.name
+        name = Path(urllib.parse.unquote(str(file))).name  # decode '%2F' to '/' etc.
        if str(file).startswith(('http:/', 'https:/')):  # download
            url = str(file).replace(':/', '://')  # Pathlib turns :// -> :/
+            name = name.split('?')[0]  # parse authentication https://url.com/file.txt?auth...
            safe_download(file=name, url=url, min_bytes=1E5)
            return name