Skip to content

Commit

Permalink
Fetch real subtitle offset and default to 0 (#184)
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 authored Apr 16, 2024
1 parent 70549c6 commit 8719442
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 4 deletions.
23 changes: 22 additions & 1 deletion src/ted2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,9 +650,23 @@ def update_videos_list(
youtube_id,
length,
subtitles,
metadata_link,
):
# append to self.videos and return if not present
if not [video for video in self.videos if video.get("id", None) == video_id]:

# Fetch the metadata and compute the subtitles offset in milliseconds (the sum
# of the durations of all domains preceding the primary domain) - we do it only
# once per video since this information is the same for all languages
subtitles_offset = 0
if metadata_link:
metadatas = request_url(metadata_link).json()
if "domains" in metadatas:
for domain in metadatas["domains"]:
if domain["primaryDomain"]:
break
subtitles_offset += int(domain["duration"] * 1000)

self.videos.append(
{
"id": video_id,
Expand All @@ -674,6 +688,7 @@ def update_videos_list(
"youtube_id": youtube_id,
"length": length,
"subtitles": subtitles,
"subtitles_offset": subtitles_offset,
}
)
logger.debug(f"Successfully inserted video {video_id} into video list")
Expand Down Expand Up @@ -776,6 +791,7 @@ def update_videos_list_from_info(self, json_data):
return False

langs = player_data["languages"]
metadata_link = player_data["resources"]["hls"]["metadata"]
subtitles = self.generate_subtitle_list(
video_id, langs, lang_code, native_talk_language
)
Expand All @@ -795,6 +811,7 @@ def update_videos_list_from_info(self, json_data):
youtube_id=youtube_id,
length=length,
subtitles=subtitles,
metadata_link=metadata_link,
)

def extract_info_from_video_page(
Expand Down Expand Up @@ -1188,10 +1205,14 @@ def download_subtitles(self, index, video):

# download subtitles
logger.debug(f"Downloading subtitles for {video['title'][0]['text']}")
if video["subtitles_offset"]:
logger.debug(f"Subtitles will be offset by {video['subtitles_offset']} ms")
valid_subs = []
for subtitle in video["subtitles"]:
time.sleep(0.5) # throttling
vtt_subtitle = WebVTT(subtitle["link"]).convert()
vtt_subtitle = WebVTT(subtitle["link"]).convert(
offset=video["subtitles_offset"]
)
if not vtt_subtitle:
logger.error(
f"Subtitle file for {subtitle['languageCode']} could not be created"
Expand Down
6 changes: 3 additions & 3 deletions src/ted2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class WebVTT:
def __init__(self, url):
self.url = url

def convert(self):
def convert(self, offset):
"""download and convert its URL to WebVTT text"""
req = request_url(self.url)

Expand All @@ -91,7 +91,7 @@ def convert(self):
except json.JSONDecodeError:
return None

return self.json_to_vtt(source_subtitles)
return self.json_to_vtt(source_subtitles, offset)

@staticmethod
def miliseconds_to_human(miliseconds):
Expand All @@ -105,7 +105,7 @@ def miliseconds_to_human(miliseconds):
return f"{hours:02}:{minutes:02}:{seconds:02}.{miliseconds:03}"

@staticmethod
def json_to_vtt(json_subtitles, offset=11820):
def json_to_vtt(json_subtitles, offset):
"""WebVTT string from TED JSON subtitles list
TED format: {"captions": [
Expand Down

0 comments on commit 8719442

Please sign in to comment.