From ebad9c356e4a4ca3da070f3795e7137e7ecce291 Mon Sep 17 00:00:00 2001 From: Anton Okhotnikov Date: Tue, 22 Aug 2023 16:05:11 +0000 Subject: [PATCH] [upd] Update lib version and add download warning --- examples/README.md | 2 ++ examples/load_all_examples.py | 20 ++++++++------------ examples/requirements.txt | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/examples/README.md b/examples/README.md index ff2c134..18c6939 100644 --- a/examples/README.md +++ b/examples/README.md @@ -101,6 +101,8 @@ cd VoxTube/examples python3 load_example.py ../resources/meta/UC-9GWCoQoMr_ey6AMhClStQ.json # example of downloading the whole dataset in N parallel jobs +# WARNING: you might run into HTTP Error 429 if too many requests +# (parallel jobs) are used; decrease the -j parameter in this case python3 load_all_examples.py -r -j N ``` diff --git a/examples/load_all_examples.py b/examples/load_all_examples.py index bd232fe..4205791 100755 --- a/examples/load_all_examples.py +++ b/examples/load_all_examples.py @@ -2,17 +2,20 @@ import argparse import multiprocessing as mp import os -import subprocess as sp from functools import partial from pathlib import Path from tqdm import tqdm +from load_example import download_process_and_cut_channel_videos -def load_json(json_path, dataset_root, load_script_path): + +def load_json(json_path, dataset_root): try: - status = sp.run(["python3", load_script_path, str(json_path), dataset_root]) - return status + download_process_and_cut_channel_videos( + json_path, + dataset_root + ) except Exception as e: print(f'Error while loading channel {json_path}') print(f'Exception: {str(e)}') @@ -23,20 +26,13 @@ def main(dataset_root, nj=1): fwd = os.path.dirname(os.path.realpath(__file__)) meta_path = Path(f'{fwd}/../resources/meta') json_paths = sorted(list(meta_path.glob('*.json'))) - path_to_download_script = f'{fwd}/load_example.py' # Run downloading load_job = partial( load_json, - dataset_root=dataset_root,
load_script_path=path_to_download_script + dataset_root=dataset_root ) - # with mp.Pool(nj) as pool: - # _ = pool.imap( - # load_job, tqdm(json_paths, total=len(json_paths)) - # ) - with tqdm(total=len(json_paths)) as pb: with mp.Pool(nj) as pool: for _ in pool.imap(load_job, json_paths): diff --git a/examples/requirements.txt b/examples/requirements.txt index 2b85faf..ea267e7 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -1,3 +1,3 @@ -yt-dlp==2023.3.3 +yt-dlp==2023.7.6 soundfile==0.12.1 tqdm==4.64.1