Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug/skip date with output #314

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions gtfs/docs/source/configuration_file.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ Config Example
"output": "output",
"filtered_feeds": "filtered_feeds",
"logs": "logs"
},
"output_file_name_regexp": "^(?P<date_str>[^_]+?)_(?P<type>\\w+)",
"output_file_type": "csv.gz"
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you remove output_file_name_regexp? If you changed it to be optional, specify it in the schema

},

"s3": {
Expand All @@ -27,8 +25,7 @@ Config Example
"bucket_name": "obus-do2",
},

"use_data_from_today": false,
"date_range": ["2019-03-07", "2019-03-07"],
"date_range": ["2019-03-07"],
}

Parameters description
Expand Down
13 changes: 11 additions & 2 deletions gtfs/gtfs_utils/gtfs_utils/config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@
"type": "boolean",
"default": false
},
"skip_dates_with_output": {
"$$description": ["When true uses the output file regex to search for dates that already ",
"has output files and skip the download and analysis of those dates"],
"type": "boolean",
"default": true
},
"override_source_data_date": {
"description": "If set use the data from the same date for all analyzed dates",
"type": "string",
Expand Down Expand Up @@ -101,7 +107,10 @@
"type": "string"
},
"output_file_name_regexp": {
"description": "A regular expression used to find existing output files.",
"$$description": ["A regular expression used to find existing output files. " ,
"This is used to search for dates that already have output files",
"The RegEx must use the named groups ``type`` for the output type (route_stats/trip_stats)",
"and ``date_str`` for the date of the output."],
"type": "string"
},
"output_file_type": {
Expand Down Expand Up @@ -138,7 +147,7 @@
"additionalProperties": false
}
},
"required": ["base_directory", "child_directories", "output_file_name_regexp", "output_file_type"],
"required": ["base_directory", "child_directories"],
"additionalProperties": false
},
"s3": {
Expand Down
2 changes: 1 addition & 1 deletion gtfs/gtfs_utils/gtfs_utils/config_example.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"filtered_feeds": "filtered_feeds",
"logs": "logs"
},
"output_file_name_regexp": "^(?P<date_str>[^_]+?)_(?P<type>\\w+)",
"output_file_name_regexp": "^(?P<type>\\w+)_(?P<date_str>[^_]+?)",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure about this change? Did you consult with @cjer?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you can see here, there is no option to change the format of the filenames:

date_str = date.strftime('%Y-%m-%d')
trip_stats_output_path = join(output_folder, f'trip_stats_{date_str}.{output_file_type}')
route_stats_output_path = join(output_folder, f'route_stats_{date_str}.{output_file_type}')

"output_file_type": "csv.gz"
},

Expand Down
7 changes: 4 additions & 3 deletions gtfs/gtfs_utils/gtfs_utils/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def all(self) -> List[str]:
class FilesConfiguration:
base_directory: str = None
child_directories: ChildDirectories = None
output_file_name_regexp: str = None
output_file_type: str = None
output_file_name_regexp: str = "^(?P<type>\\w+)_(?P<date_str>[^_]+?)"
output_file_type: str = "csv.gz"

def __init__(self):
self.__full_paths = None
Expand Down Expand Up @@ -73,8 +73,9 @@ class S3Configuration:
class Configuration:
files: FilesConfiguration = None
s3: S3Configuration = None
use_data_from_today: bool = True
use_data_from_today: bool = False
date_range: List[str] = field(default_factory=list)
skip_dates_with_output: bool = True
override_source_data_date: str = ""
max_gtfs_size_in_mb: int = sys.maxsize
display_download_progress_bar: bool = True
Expand Down
7 changes: 6 additions & 1 deletion gtfs/gtfs_utils/gtfs_utils/gtfs_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,12 @@ def batch_stats_s3(output_folder: str = None,

try:
os.makedirs(output_folder, exist_ok=True)
dates_without_output = get_dates_without_output(dates_to_analyze, output_folder)
if configuration.skip_dates_with_output:
dates_without_output = get_dates_without_output(dates_to_analyze, output_folder)
logging.info(f"Skipped {len(dates_to_analyze) - len(dates_without_output)} dates "
f"that already had output files")
else:
dates_without_output = dates_to_analyze

crud = S3Crud.from_configuration(configuration.s3)
logging.info(f'Connected to S3 bucket {configuration.s3.bucket_name}')
Expand Down
31 changes: 28 additions & 3 deletions gtfs/gtfs_utils/gtfs_utils/local_files.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import logging
import re
from os import listdir
from os.path import split, join, exists
Expand All @@ -18,20 +19,44 @@ def _get_existing_output_files(output_folder: str) -> List[Tuple[datetime.date,
configuration = load_configuration()
file_name_re = configuration.files.output_file_name_regexp
file_type_re = configuration.files.output_file_type.replace('.', '\\.')
regexp = file_name_re + '\\.' + file_type_re
regexp = re.compile(file_name_re + '\\.' + file_type_re)

existing_output_files = []

for file in listdir(output_folder):
match = re.match(regexp, file)
if match:
date_str, stats_type = match.groups()
file_type = (parse_conf_date_format(date_str), stats_type)
file_type = _parse_file_name_regex_match(match)
if file_type is None:
# return empty list if there was an error in one of the files
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd guess we'd like to return only the found files. Why would we give up on all of the files if one failed?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought it was an indication of something weird going on, but maybe you are right.
I can just log it and continue.

return []
existing_output_files.append(file_type)

return existing_output_files


def _parse_file_name_regex_match(match: re.Match):
    """Extract a ``(date, stats_type)`` pair from an output-file-name regex match.

    Prefers the named groups ``type`` and ``date_str``; when the configured
    regex does not define both, falls back to positional groups and assumes
    they appear in ``(type, date_str)`` order.

    :param match: a match of the configured output-file-name pattern.
    :return: ``(parsed_date, stats_type)`` on success, or ``None`` when the
             extracted date string cannot be parsed.
    """
    named_groups = match.groupdict()
    if "type" in named_groups and "date_str" in named_groups:
        # The configured regex uses the expected named groups.
        stats_type = named_groups.get("type")
        date_str = named_groups.get("date_str")
    else:
        # Named groups are missing — fall back to positional order.
        stats_type, date_str = match.groups()
        logging.info("The output file regex didn't use the correct group names: (type, date_str), "
                     "for more information look in the configuration docs. trying unnamed groups")
    try:
        parsed_date = parse_conf_date_format(date_str)
    except ValueError:
        # An unparseable date aborts the scan for this file.
        logging.info(f'failed to parse date from file name, skipping the search. '
                     f'the date was: {date_str!r}')
        return None
    return parsed_date, stats_type


def get_dates_without_output(dates: List[datetime.date], output_folder: str) -> List[datetime.date]:
"""
List dates without output files in the given folder (currently just route_stats is considered).
Expand Down