Fetch real subtitle offset and default to 0 (#184)

openzim · Apr 16, 2024 · 8719442 · 8719442
1 parent 70549c6
commit 8719442
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 4 deletions.
diff --git a/src/ted2zim/scraper.py b/src/ted2zim/scraper.py
@@ -650,9 +650,23 @@ def update_videos_list(
         youtube_id,
         length,
         subtitles,
+        metadata_link,
     ):
         # append to self.videos and return if not present
         if not [video for video in self.videos if video.get("id", None) == video_id]:
+
+            # Fetch metadata and compute the subtitles offset (the sum of the durations
+            # of all domains preceding the primary domain) - we do it only once per
+            # video since this information is the same for all languages
+            subtitles_offset = 0
+            if metadata_link:
+                metadatas = request_url(metadata_link).json()
+                if "domains" in metadatas:
+                    for domain in metadatas["domains"]:
+                        if domain["primaryDomain"]:
+                            break
+                        subtitles_offset += int(domain["duration"] * 1000)
+
             self.videos.append(
                 {
                     "id": video_id,
@@ -674,6 +688,7 @@ def update_videos_list(
                     "youtube_id": youtube_id,
                     "length": length,
                     "subtitles": subtitles,
+                    "subtitles_offset": subtitles_offset,
                 }
             )
             logger.debug(f"Successfully inserted video {video_id} into video list")
@@ -776,6 +791,7 @@ def update_videos_list_from_info(self, json_data):
             return False
 
         langs = player_data["languages"]
+        metadata_link = player_data["resources"]["hls"]["metadata"]
         subtitles = self.generate_subtitle_list(
             video_id, langs, lang_code, native_talk_language
         )
@@ -795,6 +811,7 @@ def update_videos_list_from_info(self, json_data):
             youtube_id=youtube_id,
             length=length,
             subtitles=subtitles,
+            metadata_link=metadata_link,
         )
 
     def extract_info_from_video_page(
@@ -1188,10 +1205,14 @@ def download_subtitles(self, index, video):
 
         # download subtitles
         logger.debug(f"Downloading subtitles for {video['title'][0]['text']}")
+        if video["subtitles_offset"]:
+            logger.debug(f"Subtitles will be offset by {video['subtitles_offset']} ms")
         valid_subs = []
         for subtitle in video["subtitles"]:
             time.sleep(0.5)  # throttling
-            vtt_subtitle = WebVTT(subtitle["link"]).convert()
+            vtt_subtitle = WebVTT(subtitle["link"]).convert(
+                offset=video["subtitles_offset"]
+            )
             if not vtt_subtitle:
                 logger.error(
                     f"Subtitle file for {subtitle['languageCode']} could not be created"

diff --git a/src/ted2zim/utils.py b/src/ted2zim/utils.py
@@ -80,7 +80,7 @@ class WebVTT:
     def __init__(self, url):
         self.url = url
 
-    def convert(self):
+    def convert(self, offset):
         """download and convert its URL to WebVTT text"""
         req = request_url(self.url)
 
@@ -91,7 +91,7 @@ def convert(self):
         except json.JSONDecodeError:
             return None
 
-        return self.json_to_vtt(source_subtitles)
+        return self.json_to_vtt(source_subtitles, offset)
 
     @staticmethod
     def miliseconds_to_human(miliseconds):
@@ -105,7 +105,7 @@ def miliseconds_to_human(miliseconds):
         return f"{hours:02}:{minutes:02}:{seconds:02}.{miliseconds:03}"
 
     @staticmethod
-    def json_to_vtt(json_subtitles, offset=11820):
+    def json_to_vtt(json_subtitles, offset):
         """WebVTT string from TED JSON subtitles list
 
         TED format: {"captions": [