Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug/skip date with output #314

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions gtfs/docs/source/configuration_file.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ Config Example
"output": "output",
"filtered_feeds": "filtered_feeds",
"logs": "logs"
},
"output_file_name_regexp": "^(?P<date_str>[^_]+?)_(?P<type>\\w+)",
"output_file_type": "csv.gz"
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you remove output_file_name_regexp? If you changed it to be optional, specify it in the schema

},

"s3": {
Expand All @@ -27,8 +25,7 @@ Config Example
"bucket_name": "obus-do2",
},

"use_data_from_today": false,
"date_range": ["2019-03-07", "2019-03-07"],
"date_range": ["2019-03-07"],
}

Parameters description
Expand Down
13 changes: 11 additions & 2 deletions gtfs/gtfs_utils/gtfs_utils/config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@
"type": "boolean",
"default": false
},
"skip_dates_with_output": {
"$$description": ["When true uses the output file regex to search for dates that already ",
"has output files and skip the download and analysis of those dates"],
"type": "boolean",
"default": true
},
"override_source_data_date": {
"description": "If set use the data from the same date for all analyzed dates",
"type": "string",
Expand Down Expand Up @@ -101,7 +107,10 @@
"type": "string"
},
"output_file_name_regexp": {
"description": "A regular expression used to find existing output files.",
"$$description": ["A regular expression used to find existing output files. " ,
"This is used to search for dates that already have output files",
"The RegEx must use the named groups ``type`` for the output type (route_stats/trip_stats)",
"and ``date_str`` for the date of the output."],
"type": "string"
},
"output_file_type": {
Expand Down Expand Up @@ -138,7 +147,7 @@
"additionalProperties": false
}
},
"required": ["base_directory", "child_directories", "output_file_name_regexp", "output_file_type"],
"required": ["base_directory", "child_directories"],
"additionalProperties": false
},
"s3": {
Expand Down
2 changes: 1 addition & 1 deletion gtfs/gtfs_utils/gtfs_utils/config_example.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"filtered_feeds": "filtered_feeds",
"logs": "logs"
},
"output_file_name_regexp": "^(?P<date_str>[^_]+?)_(?P<type>\\w+)",
"output_file_name_regexp": "^(?P<type>\\w+)_(?P<date_str>[^_]+?)",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure about this change? Did you consult with @cjer?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you can see here, there is no option to change the format of the filenames:

date_str = date.strftime('%Y-%m-%d')
trip_stats_output_path = join(output_folder, f'trip_stats_{date_str}.{output_file_type}')
route_stats_output_path = join(output_folder, f'route_stats_{date_str}.{output_file_type}')

"output_file_type": "csv.gz"
},

Expand Down
7 changes: 4 additions & 3 deletions gtfs/gtfs_utils/gtfs_utils/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def all(self) -> List[str]:
class FilesConfiguration:
base_directory: str = None
child_directories: ChildDirectories = None
output_file_name_regexp: str = None
output_file_type: str = None
output_file_name_regexp: str = "^(?P<type>\\w+)_(?P<date_str>[^_]+?)"
output_file_type: str = "csv.gz"

def __init__(self):
self.__full_paths = None
Expand Down Expand Up @@ -73,8 +73,9 @@ class S3Configuration:
class Configuration:
files: FilesConfiguration = None
s3: S3Configuration = None
use_data_from_today: bool = True
use_data_from_today: bool = False
date_range: List[str] = field(default_factory=list)
skip_dates_with_output: bool = True
override_source_data_date: str = ""
max_gtfs_size_in_mb: int = sys.maxsize
display_download_progress_bar: bool = True
Expand Down
7 changes: 6 additions & 1 deletion gtfs/gtfs_utils/gtfs_utils/gtfs_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,12 @@ def batch_stats_s3(output_folder: str = None,

try:
os.makedirs(output_folder, exist_ok=True)
dates_without_output = get_dates_without_output(dates_to_analyze, output_folder)
if configuration.skip_dates_with_output:
dates_without_output = get_dates_without_output(dates_to_analyze, output_folder)
logging.info(f"Skipped {len(dates_to_analyze) - len(dates_without_output)} dates "
f"that already had output files")
else:
dates_without_output = dates_to_analyze

crud = S3Crud.from_configuration(configuration.s3)
logging.info(f'Connected to S3 bucket {configuration.s3.bucket_name}')
Expand Down
31 changes: 28 additions & 3 deletions gtfs/gtfs_utils/gtfs_utils/local_files.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import logging
import re
from os import listdir
from os.path import split, join, exists
Expand All @@ -18,20 +19,44 @@ def _get_existing_output_files(output_folder: str) -> List[Tuple[datetime.date,
configuration = load_configuration()
file_name_re = configuration.files.output_file_name_regexp
file_type_re = configuration.files.output_file_type.replace('.', '\\.')
regexp = file_name_re + '\\.' + file_type_re
regexp = re.compile(file_name_re + '\\.' + file_type_re)

existing_output_files = []

for file in listdir(output_folder):
match = re.match(regexp, file)
if match:
date_str, stats_type = match.groups()
file_type = (parse_conf_date_format(date_str), stats_type)
file_type = _parse_file_name_regex_match(match)
if file_type is None:
# return empty list if there was an error in one of the files
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd guess we'd like to return only the found files. Why would we give up on all of the files if one failed?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought it was an indication of something weird going on, but maybe you are right.
I can just log it and continue.

return []
existing_output_files.append(file_type)

return existing_output_files


def _parse_file_name_regex_match(match: re.Match):
    """Extract a ``(date, stats_type)`` pair from an output-file-name regex match.

    Prefers the named groups ``type`` and ``date_str``; when the configured
    regex does not define both, falls back to positional groups and assumes
    they appear in ``(type, date_str)`` order.

    :param match: a match of the configured output-file-name pattern.
    :return: ``(parsed_date, stats_type)`` on success, or ``None`` when the
             extracted date string cannot be parsed.
    """
    named_groups = match.groupdict()
    if "type" in named_groups and "date_str" in named_groups:
        # The configured regex uses the expected named groups.
        stats_type = named_groups.get("type")
        date_str = named_groups.get("date_str")
    else:
        # Named groups are missing — fall back to positional order.
        stats_type, date_str = match.groups()
        logging.info("The output file regex didn't use the correct group names: (type, date_str), "
                     "for more information look in the configuration docs. trying unnamed groups")
    try:
        parsed_date = parse_conf_date_format(date_str)
    except ValueError:
        # An unparseable date aborts the scan for this file.
        logging.info(f'failed to parse date from file name, skipping the search. '
                     f'the date was: {date_str!r}')
        return None
    return parsed_date, stats_type


def get_dates_without_output(dates: List[datetime.date], output_folder: str) -> List[datetime.date]:
"""
List dates without output files in the given folder (currently just route_stats is considered).
Expand Down