Skip to content

Commit

Permalink
Refactor code
Browse files Browse the repository at this point in the history
- Add ChatGPT suggestions
- Add threaded download
- Add more prints for knowing what's going on
  • Loading branch information
vlmaier committed Apr 27, 2023
1 parent 597d5f7 commit 655e3c4
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 79 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,4 @@ dmypy.json
*.iml
*.ipr
out/
marvel-snap-cards
204 changes: 125 additions & 79 deletions scrapr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,52 +4,56 @@
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from threading import Thread
from datetime import datetime

CARDS_API_URL = "http://localhost:8080/v1/cards"
MARVELSNAPZONE_URL = 'https://marvelsnapzone.com/cards'


def scrap():
url = 'https://marvelsnapzone.com/cards'
print("[%s] %s" % (datetime.now(), "Starting scraping ..."))

chrome_options = Options()
chrome_options.headless = True
chrome_options.add_argument("--headless=new")
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
browser.get(url)
browser.get(MARVELSNAPZONE_URL)
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
# only look for link with a 'simple-card' class; those are the cards

# Only look for link with a 'simple-card' class; those are the cards.
links = soup.findAll('a', {'class': 'simple-card'})

characters = []
for link in links:
character = {
# capitalize every word
# Capitalize every word.
'name': link['data-name'].title(),
'cost': link['data-cost'],
'power': link['data-power'],
# strip html tags and capitalize
# Strip HTML tags and capitalize.
'ability': capitalize(BeautifulSoup(link['data-ability'], 'html.parser').text),
# remove query string
# Remove query string.
'url': link['data-src'].split('?')[0],
'status': link['data-status'],
'source': link['data-source']
}
characters.append(character)
# print(character)

image_urls = []
for character in characters:
image_urls.append(character['url'])
print("[%s] %s" % (datetime.now(), f"Found {character['name']}"))

# download_images(image_urls)
# TODO: uncomment to download card images.
image_urls = [character['url'] for character in characters]
download_images(image_urls)

return characters


def capitalize(text):
punc_filter = re.compile('([.!?;:]\s*)')
split_with_punctuation = punc_filter.split(text)
punctuation_filter = re.compile('([.!?;:]\s*)')
split_with_punctuation = punctuation_filter.split(text)
for i, j in enumerate(split_with_punctuation):
if len(j) > 1:
split_with_punctuation[i] = j[0].upper() + j[1:]
Expand All @@ -60,88 +64,130 @@ def capitalize(text):
def download_images(urls, dir_name='marvel-snap-cards'):
if not os.path.exists(dir_name):
os.mkdir(dir_name)
print("Directory '", dir_name, "' created ")
print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' created."))
else:
print("Directory '", dir_name, "' already exists")
print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' already exists."))

threads = []
for url in urls:
threads.append(Thread(target=download_image, args=(url, dir_name)))
threads[-1].start()
for thread in threads:
thread.join()

print("[%s] %s" % (datetime.now(), f"Finished downloading. Check '{dir_name}' directory."))


def download_image(url, dir_name):
print("[%s] %s" % (datetime.now(), f"Download image from {url}"))
try:
response = requests.get(url)
if response.status_code:
# take the last part of the URL as file name
fp = open(dir_name + '/' + url.rsplit('/', 1)[-1], 'wb')
fp.write(response.content)
fp.close()
response.raise_for_status()
file_name = url.rsplit('/', 1)[-1]
file_path = os.path.join(dir_name, file_name)
with open(file_path, 'wb') as file:
file.write(response.content)
except requests.exceptions.RequestException as e:
print("[%s] %s" % (datetime.now(), f"Error downloading image from URL '{url}': {e}"))


def create_cards(cards):
url = 'http://localhost:8080/v1/cards'
for card in cards:
if card['status'] == 'released':
body = {
'name': parse_name(card['name']),
'cost': card['cost'],
'power': card['power'],
'ability': parse_ability(card['name'], card['ability']),
'series': parse_source(card['source']),
'imageUrl': card['url']
}
requests.post(url, json=body)
print(body)
# print(card['name'] + ' created.')
if card["status"] != "released":
return

body = {
"name": parse_name(card["name"]),
"cost": card["cost"],
"power": card["power"],
"ability": parse_ability(card["name"], card["ability"]),
"series": parse_source(card["source"]),
"imageUrl": card["url"],
}

response = requests.post(CARDS_API_URL, json=body)
if response.status_code == requests.codes.created:
print("[%s] %s" % (datetime.now(), f"Created card: {card['name']}"))
else:
print("[%s] %s" % (datetime.now(), f"Failed to create card: {card['name']} - {response.text}"))


def parse_name(name):
# character name manual fix
if name == 'Ant Man':
return 'Ant-Man'
elif name == 'Jane Foster Mighty Thor':
return 'Jane Foster The Mighty Thor'
elif name == 'Miles Morales':
return 'Miles Morales: Spider-Man'
elif name == 'Super-Skrull':
return 'Super Skrull'
return name
name = name.strip()

name_mappings = {
"Ant Man": "Ant-Man",
"Jane Foster Mighty Thor": "Jane Foster The Mighty Thor",
"Miles Morales": "Miles Morales: Spider-Man",
"Super-Skrull": "Super Skrull",
}

return name_mappings.get(name, name)


def parse_ability(name, ability):
# provide 'No ability' instead of empty string
if ability == '':
ability = 'No ability'
# The Collector ability manual fix
if name == 'The Collector':
ability = 'When a card enters your hand from anywhere (except your deck), +1 power.'
# all following words should be shown in bold
bold_candidates = ["On Reveal", "Ongoing", "Widow's Bite", "Rock", "Rocks", "Doombot", "Squirrel", "Demon", "Drone",
"Mjolnir", "Tiger", "Limbo", "No ability"]
ability = ability.strip()

# Provide 'No ability' instead of empty string.
if not ability:
ability = "No ability"

# The Collector ability manual fix.
if name == "The Collector":
ability = "When a card enters your hand from anywhere (except your deck), +1 power."

# All following words should be shown in bold.
bold_candidates = [
"On Reveal",
"Ongoing",
"Widow's Bite",
"Rock",
"Rocks",
"Doombot",
"Squirrel",
"Demon",
"Drone",
"Mjolnir",
"Tiger",
"Limbo",
"No ability",
]

for candidate in bold_candidates:
if ability.lower().__contains__(candidate.lower()):
ability = re.sub(candidate.lower(), "<span class='fw-bold'>" + candidate + "</span>", ability,
flags=re.IGNORECASE)
for i in ["1", "2", "3", "4", "5", "6", "7", "8", "9"]:
# +[1-9] should be shown in bold and green color
ability = re.sub("[+][" + i + "]", "<span class='fw-bold' style='color: green;'>+" + i + "</span>", ability)
# -[1-9] should be shown in bold and red color
ability = re.sub("[-][" + i + "]", "<span class='fw-bold' style='color: red;'>-" + i + "</span>", ability)
return ability
if candidate.lower() in ability.lower():
ability = re.sub(
candidate.lower(),
f"<span class='fw-bold'>{candidate}</span>",
ability,
flags=re.IGNORECASE,
)

for i in range(1, 10):
# +[1-9] should be shown in bold and green color.
# -[1-9] should be shown in bold and red color.
ability = re.sub(
fr"[+][{i}]",
f"<span class='fw-bold' style='color: green;'>+{i}</span>",
ability,
)


def parse_source(source):
if source.__contains__('Collection Level 1-14') or source.__contains__('Starter Card') or source.__contains__('Recruit Season'):
return 'Starter'
elif source.__contains__('Pool 1'):
return 'Series 1'
elif source.__contains__('Pool 2'):
return 'Series 2'
elif source.__contains__('Pool 3'):
return 'Series 3'
elif source.__contains__('Pool 4'):
return 'Series 4'
elif source.__contains__('Pool 5'):
return 'Series 5'
elif source.__contains__('Season Pass'):
return 'Season Pass'
else:
return ''
series_map = {
'Collection Level 1-14': 'Starter',
'Starter Card': 'Starter',
'Recruit Season': 'Starter',
'Pool 1': 'Series 1',
'Pool 2': 'Series 2',
'Pool 3': 'Series 3',
'Pool 4': 'Series 4',
'Pool 5': 'Series 5',
'Season Pass': 'Season Pass'
}
for key in series_map:
if key in source:
return series_map[key]
return ''


if __name__ == '__main__':
Expand Down

0 comments on commit 655e3c4

Please sign in to comment.