-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubmissions.py
60 lines (53 loc) · 2.15 KB
/
submissions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from helpers import AbstractTool
from typing import Union
import logging
import downloading
import extraction
import processing
import verification
class SubmissionTool(AbstractTool):
    """Command front-end for working with submission dumps (file prefix ``"RS"``).

    Every method delegates to the ``downloading``, ``extraction``,
    ``processing`` or ``verification`` module, always passing the ``"RS"``
    prefix. Date handling (``_initialize_dates``, ``_get_date_range_str`` and
    ``periods``) is not defined in this class; presumably it is inherited
    from ``AbstractTool`` -- confirm against ``helpers``.
    """

    def download(
        self,
        since: Union[str, int],
        until: Union[str, int, None],
        force: bool = False,
        checkhash: bool = False,
        checksize: bool = False,
        retry: bool = False,
        max_attempts: int = 3,
    ) -> None:
        """Download the submission dump of every period between *since* and *until*.

        Args:
            since: Start of the date range, handed to ``_initialize_dates``.
            until: End of the date range (may be ``None``), handed to
                ``_initialize_dates``.
            force: Forwarded unchanged to ``downloading.download_dump``.
            checkhash: Forwarded unchanged to ``downloading.download_dump``.
            checksize: Forwarded unchanged to ``downloading.download_dump``.
            retry: Forwarded unchanged to ``downloading.download_dump``.
            max_attempts: Forwarded unchanged to ``downloading.download_dump``.
        """
        self._initialize_dates(since, until)
        logging.info(f"Downloading available submission dumps from {self._get_date_range_str()}")
        for p in self.periods:
            # Each period is indexed as (year, month, ...); only the first two
            # items are used.
            downloading.download_dump(
                prefix="RS",
                year=p[0],
                month=p[1],
                force=force,
                checkhash=checkhash,
                checksize=checksize,
                retry=retry,
                max_attempts=max_attempts,
            )

    def extract(
        self, since: Union[str, int], until: Union[str, int, None], subreddit: str, force: bool = False
    ) -> None:
        """Extract one subreddit's submissions from the dumps in the date range.

        Args:
            since: Start of the date range, handed to ``_initialize_dates``.
            until: End of the date range (may be ``None``), handed to
                ``_initialize_dates``.
            subreddit: Subreddit name; lower-cased and stripped of surrounding
                whitespace before use.
            force: Forwarded unchanged to ``extraction.extract_from_dump``.
        """
        self._initialize_dates(since, until)
        subreddit = subreddit.lower().strip()
        # The message used to say "comments", which was wrong for this tool:
        # it only ever processes "RS" (submission) dumps.
        logging.info(f"Extracting downloaded submissions for subreddit '{subreddit}' from {self._get_date_range_str()}")
        for p in self.periods:
            extraction.extract_from_dump("RS", year=p[0], month=p[1], subreddit=subreddit, force=force)

    def split(self, since: Union[str, int], until: Union[str, int, None], subreddit: str) -> None:
        """Split the monthly submission files of a subreddit via ``processing.split_extracted``.

        Args:
            since: Start of the date range, handed to ``_initialize_dates``.
            until: End of the date range (may be ``None``), handed to
                ``_initialize_dates``.
            subreddit: Subreddit name; lower-cased and stripped of surrounding
                whitespace before use.
        """
        self._initialize_dates(since, until)
        subreddit = subreddit.lower().strip()
        logging.info(
            f"Splitting monthly '{subreddit}' submission files from {self._get_date_range_str()} into daily files"
        )
        for p in self.periods:
            processing.split_extracted("RS", year=p[0], month=p[1], subreddit=subreddit)

    def checksize(self, size_ratio: float = 0.8) -> None:
        """Run ``verification.check_filesizes`` for submission files.

        Args:
            size_ratio: Forwarded unchanged to ``verification.check_filesizes``;
                defaults to ``0.8``.
        """
        verification.check_filesizes("RS", size_ratio)

    def checkhash(self) -> None:
        """Run ``verification.check_filehashes`` for submission files."""
        verification.check_filehashes("RS")

    # NOTE: this method name shadows the builtin ``list`` inside the class
    # body. It is kept as-is because callers depend on the name; avoid using
    # ``list[...]`` in annotations added below this definition.
    def list(self, downloaded: bool = True, extracted: bool = False) -> None:
        """List submission files via ``verification.list_files``.

        Args:
            downloaded: Forwarded unchanged to ``verification.list_files``.
            extracted: Forwarded unchanged to ``verification.list_files``.
        """
        verification.list_files("RS", downloaded, extracted)