-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsing.py
126 lines (91 loc) · 4.14 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pycurl
import certifi
import datetime
from io import BytesIO
from bs4 import BeautifulSoup
# French month names (lower-case) mapped to their calendar number, 1 to 12
month_dict = {
    name: number
    for number, name in enumerate(
        ('janvier', 'février', 'mars', 'avril', 'mai', 'juin',
         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'),
        start=1,
    )
}
def convert_to_date(menu_string):
    """Convert a French menu title to a date.

    Relies on the module-level ``month_dict`` to translate month names.

    Args:
        menu_string (str): Text containing "<day> <month name> <year>" in
            French, e.g. "Mardi 12 janvier 2021" or a longer title such as
            "Menu du mardi 12 janvier 2021".

    Returns:
        datetime.date: The parsed date; ``str()`` of it gives the ISO form
            (ex: "2021-01-12").

    Raises:
        IndexError: No date was found and the string has fewer than 6 words.
        KeyError: No date was found and the 5th word is not a known month.
        ValueError: A day/year word is not a number, or the day does not
            exist in that month.
    """
    words = menu_string.split()
    # Look for a "<number> <month name> <number>" run anywhere in the title
    # instead of assuming a fixed count of leading words.
    for index in range(1, len(words) - 1):
        month = month_dict.get(words[index].lower())
        day, year = words[index - 1], words[index + 1]
        if month is not None and day.isdecimal() and year.isdecimal():
            return datetime.date(int(year), month, int(day))
    # Nothing matched: fall back to the historical fixed positions so that
    # malformed titles keep raising the same exceptions as before.
    return datetime.date(int(words[5]), month_dict[words[4].lower()], int(words[3]))
def get_html(URL):
    """Download a web page and return its HTML source.

    Args:
        URL (str): Address of the page to fetch.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        pycurl.error: If the transfer fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Buffer that receives the raw response bytes
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, URL)
        # Have curl write the response body into the buffer
        c.setopt(c.WRITEDATA, buffer)
        # Set the User-Agent
        c.setopt(c.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
        # Use certifi's CA bundle to verify the server certificate
        c.setopt(c.CAINFO, certifi.where())
        c.perform()
    finally:
        # Always release the curl handle, even when the transfer fails
        c.close()
    # Decode only once the transfer has completed successfully
    return buffer.getvalue().decode('utf-8')
# Per-restaurant parsing rules: location -> (heading prefixes that open a
# pole, separator preceding the pole name in the raw <li> markup, dish
# entries to discard).
_POLE_RULES = {
    'Illkirch': (('SALLE DES ETUDIANTS - ',), ' - ', ('ou', 'Ou')),
    'Cronenbourg': (('Grillade', 'Plat du jour', 'Végétarien', 'Extension'), '>', ()),
    'Paul-Appell': (('Pôle végétal', 'Flam and Co', 'Plat du jour', 'Annexe'), '>', ('ou', 'Ou')),
    'Gallia': (('MENU ETUDIANT', 'Végétarien'), '>', ('ou', 'Ou')),
    'Esplanade': (('Menus', 'Desserts'), '>', ('ou', 'Ou', '-')),
}


def _parse_poles(foodies, location):
    """Return {pole name: [dish, ...]} for one 'meal_foodies' list."""
    rules = _POLE_RULES.get(location)
    if foodies is None or rules is None:
        return {}
    prefixes, separator, skipped = rules
    poles = {}
    for li in foodies.find_all('li'):
        if not li.text.startswith(prefixes):
            continue
        # The pole name is the markup that precedes the nested dish list
        pole = str(li).split('<ul')[0].split(separator)[1]
        poles[pole] = [dish.text.replace('\'', ' ')
                       for dish in li.find_all('li')
                       if dish.text not in skipped]
    return poles


def parse_menu(htmlStr, location):
    """Parse the html code to get the menus.

    Args:
        htmlStr (str): The html code
        location (str): The location of the restaurant ("Illkirch",
            "Cronenbourg", "Paul-Appell", "Esplanade", "Gallia"). Any other
            value yields meals without poles.

    Returns:
        list[dict]: One dict per 'menu_date_title' element, holding "title"
            (the heading text), "date" (ISO string) and one key per meal
            title mapping pole names to lists of dish strings.
    """
    soup = BeautifulSoup(htmlStr, 'html.parser')
    # Both lookups are the same for every day, so run them only once
    meal_lists = soup.find_all('ul', class_='meal_foodies')
    meal_titles = [div.text for div in soup.find_all('div', class_='meal_title')]

    menus = []
    headings = soup.find_all('time', class_='menu_date_title')
    for day_index, heading in enumerate(headings):
        menu = {"title": heading.text,
                "date": str(convert_to_date(heading.text))}
        # A heading without a matching list gives empty meals, not IndexError
        foodies = meal_lists[day_index] if day_index < len(meal_lists) else None
        # NOTE(review): the list is picked per day, so a 'Dîner' entry is
        # filled from the same list as the day's first meal - confirm
        # against the live markup.
        for position, meal_name in enumerate(meal_titles):
            # Keep the first meal title, plus any 'Dîner' right after it.
            # The check restarts for each day, so one day's scan can no
            # longer leave the following day empty.
            if position > 0 and meal_name != 'Dîner':
                break
            menu[meal_name] = _parse_poles(foodies, location)
        menus.append(menu)
    return menus