-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetIngredients.py
155 lines (138 loc) · 6.63 KB
/
getIngredients.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import os
import sys
import json
import copy
import csv
# the layer-merge code is provided by the recipe1m team
def dspath(ext, ROOT, **kwargs):
    """Return the location of the dataset file ``ext`` underneath ``ROOT``.

    Args:
        ext: File name (or relative path) of the dataset file.
        ROOT: Directory the dataset lives in.
        **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
        ``ROOT`` and ``ext`` combined with the platform path separator.
    """
    location = os.path.join(ROOT, ext)
    return location
class Layer(object):
    """Names of the dataset's JSON layers plus helpers to load and merge them."""

    # Layer names: paths relative to the dataset root, without the ".json" suffix.
    L1 = 'recipe1M/layer1'
    L2 = 'recipe1M/layer2'
    L3 = 'recipe1M/layer3'
    INGRS = 'recipe1M/det_ingrs'
    GOODIES = 'goodies'

    @staticmethod
    def load(name, ROOT, **kwargs):
        """Parse ``<name>.json`` (located through ``dspath``) and return its content."""
        layer_file = dspath(name + '.json', ROOT, **kwargs)
        with open(layer_file) as handle:
            contents = json.load(handle)
        return contents

    @staticmethod
    def merge(layers, ROOT, copy_base=False, **kwargs):
        """Fold several layers into the first one, matching entries by ``'id'``.

        Args:
            layers: Sequence whose items are either already-loaded layers
                (lists of dicts) or layer names to be read with ``load``.
            ROOT: Dataset root directory handed to ``load``.
            copy_base: When true, merge into a deep copy of the first layer
                instead of mutating it in place.
            **kwargs: Forwarded to ``load``.

        Returns:
            The first layer (or its copy). Every entry has been updated with
            the fields of same-id entries from the later layers; entries whose
            id does not occur in the first layer are dropped.
        """
        # Resolve every layer up front so names and lists are treated alike.
        loaded = []
        for layer in layers:
            if not isinstance(layer, list):
                layer = Layer.load(layer, ROOT, **kwargs)
            loaded.append(layer)

        base = loaded[0]
        if copy_base:
            base = copy.deepcopy(base)

        by_id = {}
        for record in base:
            by_id[record['id']] = record

        for extra in loaded[1:]:
            for record in extra:
                target = by_id.get(record['id'])
                if target:
                    target.update(record)
        return base
def getIngredient(key, i, found=None):
    """Record ``key`` if that ingredient occurs in the raw ingredient text ``i``.

    ``key`` is one entry of the ingredient vocabulary (ingredients.csv) and
    ``i`` is one raw ingredient string taken from the layer JSON files. The
    key matches when

    * the key (underscores turned into spaces) is a substring of ``i``, and
    * every word of the key appears among the space-separated words of ``i``,
      either verbatim or in a simple plural form (word + "s" / word + "es").

    Example: key="tomato", i="small tomatoes" matches because "tomatoes" is
    the plural of "tomato"; key="rice", i="licorice stick" does not, because
    no whole word of ``i`` is "rice".

    Args:
        key: Vocabulary ingredient; underscores are treated as spaces.
        i: Raw ingredient text of a recipe.
        found: List that receives the matched key. When omitted, the
            module-level ``listOfIngr`` list is used, as before.

    Returns:
        None. A matched key (with spaces instead of underscores) is appended
        to the target list unless it is already in it.
    """
    key = key.replace("_", " ")
    if key not in i:
        return

    raw_words = set(i.split(" "))
    for word in key.split(" "):
        # The plural is derived from the vocabulary word, never from the raw
        # word, and a set lookup is immune to words repeated in the raw text.
        if raw_words.isdisjoint((word, word + 's', word + 'es')):
            return

    # Resolve the target only once a match is certain, so the global is not
    # touched (or required to exist) for non-matching input.
    matches = listOfIngr if found is None else found
    if key not in matches:
        matches.append(key)
if __name__ == "__main__":
    # Builds, for every image of every sufficiently labelled recipe in one
    # dataset partition, the list of vocabulary ingredients found in the
    # recipe plus a binary presence vector aligned with the sorted vocabulary,
    # and writes everything to a single JSON object keyed by image id.

    TARGET_PARTITION = "test"  # CHANGE partition to "train", "test", "val" to retrieve the different sets
    OUTPUT_PATH = 'recipe1M/datatest2.json'  # CHANGE json's name

    data_path = os.path.dirname(sys.argv[0])
    # the dataset is a list, where each entry is a recipe. the recipe contains a dictionary
    dataset = Layer.merge([Layer.L1, Layer.L2, Layer.INGRS], data_path)

    with open('recipe1M/ingredients-reduc.csv', mode='r', newline='') as infile:
        # Skip blank rows; indexing them would raise IndexError.
        ingrsSet = {row[0] for row in csv.reader(infile) if row}

    # Sorted, human-readable vocabulary; position k of every binary vector
    # written below refers to mylist[k].
    mylist = sorted(name.replace("_", " ") for name in ingrsSet)
    # Fixed iteration order so the ingredient lists are reproducible between runs.
    ingredient_keys = sorted(ingrsSet)

    recipeIngredDict = {}  # recipe id -> ingredients found for that recipe

    # The file is rewritten on each run and filled incrementally as ONE JSON
    # object, so it can be loaded with json.load without manual post-editing.
    with open(OUTPUT_PATH, mode='w') as out:
        out.write('{')
        wrote_entry = False

        # start the loop for each entry in the original dataset
        for d in dataset:
            if d['partition'] != TARGET_PARTITION:
                continue

            # Raw ingredient texts of this recipe, keeping only the ones flagged
            # as valid. A fresh list per recipe avoids leaking ingredients from
            # one recipe into the next.
            recipeIngr = [
                ingredient['text']
                for ingredient, is_valid in zip(d['ingredients'], d['valid'])
                if is_valid
            ]

            # getIngredient() appends its matches to this module-level list.
            listOfIngr = []
            # for every ingredient (key) in the vocabulary and every raw text,
            # check whether the key occurs in the raw text
            for key in ingredient_keys:
                for raw_text in recipeIngr:
                    getIngredient(key, raw_text)

            # Recipes with fewer than two recognised ingredients are skipped.
            if len(listOfIngr) <= 1:
                continue
            recipeIngredDict[d['id']] = listOfIngr

            images = d.get('images')
            if not images:  # the recipe has no images
                continue

            # The presence vector depends only on the recipe, so build it once
            # and reuse it for each of the recipe's images.
            present = set(listOfIngr)
            binarymylist = [1 if name in present else 0 for name in mylist]

            for image in images:
                # Serialise a one-entry dict and strip its braces: this gives
                # the same key/value encoding json.dump would produce.
                entry = json.dumps({image['id']: [listOfIngr, binarymylist]})
                if wrote_entry:
                    out.write(', ')
                out.write(entry[1:-1])
                wrote_entry = True

        out.write('}')