From 54749a9e7aff121f09688208145d03c7fea0f906 Mon Sep 17 00:00:00 2001 From: zGadli Date: Sat, 28 Dec 2024 20:11:59 -0800 Subject: [PATCH 1/6] Fixed royal road watermark issue. #2531 --- sources/en/r/royalroad.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sources/en/r/royalroad.py b/sources/en/r/royalroad.py index e5c9418f2..f293aa29d 100644 --- a/sources/en/r/royalroad.py +++ b/sources/en/r/royalroad.py @@ -77,15 +77,11 @@ def download_chapter_body(self, chapter): if possible_title and "Chapter" in possible_title.text: chapter["title"] = possible_title.text.strip() - classnames = [] - for style in soup.select("style"): - style = style.text.replace(" ", "").replace("\n", "") - if style.endswith("{display:none;speak:never;}"): - classnames.append(style[1:-27]) - - for classname in classnames: - for div in soup.find_all("p", {"class": classname}): - div.decompose() + chapter_contents = soup.select(".chapter-content") + for chapter_content in chapter_contents: + for html_tags in chapter_content.contents: + if html_tags.name == 'div': + html_tags.decompose() contents = soup.select_one(".chapter-content") self.cleaner.clean_contents(contents) From 211ee78347ba6ee76c3c27ef8de7f79a8b95c4d2 Mon Sep 17 00:00:00 2001 From: zGadli Date: Sun, 29 Dec 2024 15:46:27 -0800 Subject: [PATCH 2/6] Fix source syosetu --- sources/jp/s/syosetu.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py index cec560b49..05fb0c128 100644 --- a/sources/jp/s/syosetu.py +++ b/sources/jp/s/syosetu.py @@ -2,6 +2,8 @@ import logging from urllib.parse import quote_plus from lncrawl.core.crawler import Crawler +from concurrent.futures import ThreadPoolExecutor +from bs4 import element logger = logging.getLogger(__name__) search_url = "https://yomou.syosetu.com/search.php?word=%s" @@ -35,10 +37,10 @@ def search_novel(self, query): return results def read_novel_info(self): - self.init_parser('xml') + self.init_parser('lxml') soup = self.get_soup(self.novel_url) - self.novel_title = soup.select_one(".novel_title").text.strip() + self.novel_title = soup.select_one(".p-novel__title").text.strip() logger.debug('Novel title: %s', self.novel_title) # No novel cover. @@ -49,12 +51,13 @@ def read_novel_info(self): # Syosetu calls parts "chapters" soups = [] - pager_last = soup.select_one("a[class='novelview_pager-last']") + pager_last = soup.select_one(".c-pager__item--last") if pager_last and 'href' in pager_last.attrs: page_num = int(pager_last["href"].split("=")[-1]) - for x in range(1, page_num + 1): - soup = self.get_soup(f'{self.novel_url}?p={x}') - soups.append(soup) + with ThreadPoolExecutor() as executor: + futures = [executor.submit(self.get_soup, f'{self.novel_url}?p={x}') for x in range(1, page_num + 1)] + for future in futures: + soups.append(future.result()) else: soups.append(soup) @@ -62,16 +65,22 @@ def read_novel_info(self): chapter_id = 0 self.volumes.append({'id': 0}) for soup in soups: - for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"): - if 'chapter_title' in tag.attrs.get('class', ''): + # soup.contents[1].contents[3].contents[5].contents[1].contents[18].contents[14].contents + for tag in soup.select_one(".p-eplist"): + + if type(tag) is element.NavigableString: + continue + + if 'p-eplist__chapter-title'in tag.attrs.get('class', ''): # Part/volume (there might be none) volume_id += 1 self.volumes.append({ 'id': volume_id, 'title': tag.text.strip(), }) - elif tag.name == "a": + elif tag.select('a')[0]: # Chapter + tag = tag.select('a')[0] chapter_id += 1 self.chapters.append({ "id": chapter_id, @@ -82,6 +91,7 @@ def read_novel_info(self): def download_chapter_body(self, chapter): soup = self.get_soup(chapter["url"]) - contents = soup.select_one("#novel_honbun") + contents = soup.select_one(".p-novel__body") + logger.debug(f'Contents: {contents}') contents = self.cleaner.extract_contents(contents) return contents From c6e880bdd211ddb8b3fff4a97271733af0b9deb1 Mon Sep 17 00:00:00 2001 From: zGadli Date: Sun, 29 Dec 2024 15:51:50 -0800 Subject: [PATCH 3/6] Revert duplicate commit --- sources/en/r/royalroad.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sources/en/r/royalroad.py b/sources/en/r/royalroad.py index f293aa29d..e5c9418f2 100644 --- a/sources/en/r/royalroad.py +++ b/sources/en/r/royalroad.py @@ -77,11 +77,15 @@ def download_chapter_body(self, chapter): if possible_title and "Chapter" in possible_title.text: chapter["title"] = possible_title.text.strip() - chapter_contents = soup.select(".chapter-content") - for chapter_content in chapter_contents: - for html_tags in chapter_content.contents: - if html_tags.name == 'div': - html_tags.decompose() + classnames = [] + for style in soup.select("style"): + style = style.text.replace(" ", "").replace("\n", "") + if style.endswith("{display:none;speak:never;}"): + classnames.append(style[1:-27]) + + for classname in classnames: + for div in soup.find_all("p", {"class": classname}): + div.decompose() contents = soup.select_one(".chapter-content") self.cleaner.clean_contents(contents) From 37565b2547fd35a9a14f772eba95f83aca85f412 Mon Sep 17 00:00:00 2001 From: zGadli Date: Sun, 29 Dec 2024 15:55:14 -0800 Subject: [PATCH 4/6] remove comment --- sources/jp/s/syosetu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py index 05fb0c128..8586eacdb 100644 --- a/sources/jp/s/syosetu.py +++ b/sources/jp/s/syosetu.py @@ -65,7 +65,6 @@ def read_novel_info(self): chapter_id = 0 self.volumes.append({'id': 0}) for soup in soups: - # soup.contents[1].contents[3].contents[5].contents[1].contents[18].contents[14].contents for tag in soup.select_one(".p-eplist"): if type(tag) is element.NavigableString: From 6223356d151f206d82a076de6085f15518844ea0 Mon Sep 17 00:00:00 2001 From: zGadli Date: Sun, 29 Dec 2024 15:56:13 -0800 Subject: [PATCH 5/6] remove debug log --- sources/jp/s/syosetu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py index 8586eacdb..c8366e79a 100644 --- a/sources/jp/s/syosetu.py +++ b/sources/jp/s/syosetu.py @@ -91,6 +91,5 @@ def read_novel_info(self): def download_chapter_body(self, chapter): soup = self.get_soup(chapter["url"]) contents = soup.select_one(".p-novel__body") - logger.debug(f'Contents: {contents}') contents = self.cleaner.extract_contents(contents) return contents From 1ec4effaa5f519ae1e807ac5d836e1d570491e46 Mon Sep 17 00:00:00 2001 From: zGadli Date: Sun, 29 Dec 2024 16:39:50 -0800 Subject: [PATCH 6/6] fix format --- sources/jp/s/syosetu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py index c8366e79a..f06e25ab5 100644 --- a/sources/jp/s/syosetu.py +++ b/sources/jp/s/syosetu.py @@ -70,7 +70,7 @@ def read_novel_info(self): if type(tag) is element.NavigableString: continue - if 'p-eplist__chapter-title'in tag.attrs.get('class', ''): + if 'p-eplist__chapter-title' in tag.attrs.get('class', ''): # Part/volume (there might be none) volume_id += 1 self.volumes.append({