From 655e3c471c69d06bc57555ee15caa7383af05492 Mon Sep 17 00:00:00 2001 From: Vladas Maier Date: Thu, 27 Apr 2023 21:14:36 +0200 Subject: [PATCH] Refactor code - Add ChatGPT suggestions - Add threaded download - Add more prints for knowing what's going on --- .gitignore | 1 + scrapr.py | 204 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 126 insertions(+), 79 deletions(-) diff --git a/.gitignore b/.gitignore index 0b93ff8..c83da66 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,4 @@ dmypy.json *.iml *.ipr out/ +marvel-snap-cards diff --git a/scrapr.py b/scrapr.py index 814a234..04b4f1c 100644 --- a/scrapr.py +++ b/scrapr.py @@ -4,52 +4,56 @@ from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.chrome.options import Options +from threading import Thread +from datetime import datetime + +CARDS_API_URL = "http://localhost:8080/v1/cards" +MARVELSNAPZONE_URL = 'https://marvelsnapzone.com/cards' def scrap(): - url = 'https://marvelsnapzone.com/cards' + print("[%s] %s" % (datetime.now(), "Starting scraping ...")) chrome_options = Options() - chrome_options.headless = True + chrome_options.add_argument("--headless=new") chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--disable-extensions') chrome_options.add_argument('--disable-gpu') browser = webdriver.Chrome(options=chrome_options) - browser.get(url) + browser.get(MARVELSNAPZONE_URL) html = browser.page_source soup = BeautifulSoup(html, 'html.parser') - # only look for link with a 'simple-card' class; those are the cards + + # Only look for link with a 'simple-card' class; those are the cards. links = soup.findAll('a', {'class': 'simple-card'}) characters = [] for link in links: character = { - # capitalize every word + # Capitalize every word. 'name': link['data-name'].title(), 'cost': link['data-cost'], 'power': link['data-power'], - # strip html tags and capitalize + # Strip HTML tags and capitalize. 'ability': capitalize(BeautifulSoup(link['data-ability'], 'html.parser').text), - # remove query string + # Remove query string. 'url': link['data-src'].split('?')[0], 'status': link['data-status'], 'source': link['data-source'] } characters.append(character) - # print(character) - - image_urls = [] - for character in characters: - image_urls.append(character['url']) + print("[%s] %s" % (datetime.now(), f"Found {character['name']}")) - # download_images(image_urls) + # TODO: uncomment to download card images. + image_urls = [character['url'] for character in characters] + download_images(image_urls) return characters def capitalize(text): - punc_filter = re.compile('([.!?;:]\s*)') - split_with_punctuation = punc_filter.split(text) + punctuation_filter = re.compile('([.!?;:]\s*)') + split_with_punctuation = punctuation_filter.split(text) for i, j in enumerate(split_with_punctuation): if len(j) > 1: split_with_punctuation[i] = j[0].upper() + j[1:] @@ -60,88 +64,130 @@ def capitalize(text): def download_images(urls, dir_name='marvel-snap-cards'): if not os.path.exists(dir_name): os.mkdir(dir_name) - print("Directory '", dir_name, "' created ") + print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' created.")) else: - print("Directory '", dir_name, "' already exists") + print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' already exists.")) + threads = [] for url in urls: + threads.append(Thread(target=download_image, args=(url, dir_name))) + threads[-1].start() + for thread in threads: + thread.join() + + print("[%s] %s" % (datetime.now(), f"Finished downloading. Check '{dir_name}' directory.")) + + +def download_image(url, dir_name): + print("[%s] %s" % (datetime.now(), f"Download image from {url}")) + try: response = requests.get(url) - if response.status_code: - # take the last part of the URL as file name - fp = open(dir_name + '/' + url.rsplit('/', 1)[-1], 'wb') - fp.write(response.content) - fp.close() + response.raise_for_status() + file_name = url.rsplit('/', 1)[-1] + file_path = os.path.join(dir_name, file_name) + with open(file_path, 'wb') as file: + file.write(response.content) + except requests.exceptions.RequestException as e: + print("[%s] %s" % (datetime.now(), f"Error downloading image from URL '{url}': {e}")) def create_cards(cards): - url = 'http://localhost:8080/v1/cards' for card in cards: - if card['status'] == 'released': - body = { - 'name': parse_name(card['name']), - 'cost': card['cost'], - 'power': card['power'], - 'ability': parse_ability(card['name'], card['ability']), - 'series': parse_source(card['source']), - 'imageUrl': card['url'] - } - requests.post(url, json=body) - print(body) - # print(card['name'] + ' created.') + if card["status"] != "released": + return + + body = { + "name": parse_name(card["name"]), + "cost": card["cost"], + "power": card["power"], + "ability": parse_ability(card["name"], card["ability"]), + "series": parse_source(card["source"]), + "imageUrl": card["url"], + } + + response = requests.post(CARDS_API_URL, json=body) + if response.status_code == requests.codes.created: + print("[%s] %s" % (datetime.now(), f"Created card: {card['name']}")) + else: + print("[%s] %s" % (datetime.now(), f"Failed to create card: {card['name']} - {response.text}")) def parse_name(name): - # character name manual fix - if name == 'Ant Man': - return 'Ant-Man' - elif name == 'Jane Foster Mighty Thor': - return 'Jane Foster The Mighty Thor' - elif name == 'Miles Morales': - return 'Miles Morales: Spider-Man' - elif name == 'Super-Skrull': - return 'Super Skrull' - return name + name = name.strip() + + name_mappings = { + "Ant Man": "Ant-Man", + "Jane Foster Mighty Thor": "Jane Foster The Mighty Thor", + "Miles Morales": "Miles Morales: Spider-Man", + "Super-Skrull": "Super Skrull", + } + + return name_mappings.get(name, name) def parse_ability(name, ability): - # provide 'No ability' instead of empty string - if ability == '': - ability = 'No ability' - # The Collector ability manual fix - if name == 'The Collector': - ability = 'When a card enters your hand from anywhere (except your deck), +1 power.' - # all following words should be shown in bold - bold_candidates = ["On Reveal", "Ongoing", "Widow's Bite", "Rock", "Rocks", "Doombot", "Squirrel", "Demon", "Drone", - "Mjolnir", "Tiger", "Limbo", "No ability"] + ability = ability.strip() + + # Provide 'No ability' instead of empty string. + if not ability: + ability = "No ability" + + # The Collector ability manual fix. + if name == "The Collector": + ability = "When a card enters your hand from anywhere (except your deck), +1 power." + + # All following words should be shown in bold. + bold_candidates = [ + "On Reveal", + "Ongoing", + "Widow's Bite", + "Rock", + "Rocks", + "Doombot", + "Squirrel", + "Demon", + "Drone", + "Mjolnir", + "Tiger", + "Limbo", + "No ability", + ] + for candidate in bold_candidates: - if ability.lower().__contains__(candidate.lower()): - ability = re.sub(candidate.lower(), "" + candidate + "", ability, - flags=re.IGNORECASE) - for i in ["1", "2", "3", "4", "5", "6", "7", "8", "9"]: - # +[1-9] should be shown in bold and green color - ability = re.sub("[+][" + i + "]", "+" + i + "", ability) - # -[1-9] should be shown in bold and red color - ability = re.sub("[-][" + i + "]", "-" + i + "", ability) - return ability + if candidate.lower() in ability.lower(): + ability = re.sub( + candidate.lower(), + f"{candidate}", + ability, + flags=re.IGNORECASE, + ) + + for i in range(1, 10): + # +[1-9] should be shown in bold and green color. + # -[1-9] should be shown in bold and red color. + ability = re.sub( + fr"[+][{i}]", + f"+{i}", + ability, + ) def parse_source(source): - if source.__contains__('Collection Level 1-14') or source.__contains__('Starter Card') or source.__contains__('Recruit Season'): - return 'Starter' - elif source.__contains__('Pool 1'): - return 'Series 1' - elif source.__contains__('Pool 2'): - return 'Series 2' - elif source.__contains__('Pool 3'): - return 'Series 3' - elif source.__contains__('Pool 4'): - return 'Series 4' - elif source.__contains__('Pool 5'): - return 'Series 5' - elif source.__contains__('Season Pass'): - return 'Season Pass' - else: - return '' + series_map = { + 'Collection Level 1-14': 'Starter', + 'Starter Card': 'Starter', + 'Recruit Season': 'Starter', + 'Pool 1': 'Series 1', + 'Pool 2': 'Series 2', + 'Pool 3': 'Series 3', + 'Pool 4': 'Series 4', + 'Pool 5': 'Series 5', + 'Season Pass': 'Season Pass' + } + for key in series_map: + if key in source: + return series_map[key] + return '' if __name__ == '__main__':