-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_mars.py
128 lines (91 loc) · 3.74 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Dependencies
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
def init_browser():
    """Launch a visible Chrome browser session for scraping.

    Uses webdriver_manager (imported at module top but previously unused)
    to download and cache a chromedriver matching the installed Chrome,
    replacing the hard-coded, user-specific driver path that only worked
    on one machine.

    Returns:
        splinter.Browser: a non-headless Chrome browser session.
    """
    # ChromeDriverManager().install() returns the filesystem path of a
    # cached driver binary, so no manual path editing is required.
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=False)
def scrape():
    """Scrape Mars news, featured image, facts table and hemisphere images.

    Returns:
        dict: with keys
            "news_title"     (str)  – latest NASA Mars news headline,
            "news_paragraph" (str)  – its teaser paragraph,
            "featured_image" (str)  – full URL of JPL's featured image,
            "mars_facts"     (str)  – HTML table of Mars facts,
            "hemispheres"    (list) – dicts of {"title", "img_url"}.
    """
    browser = init_browser()
    try:
        news_title, news_p = _scrape_news(browser)
        featured_img = _scrape_featured_image(browser)
        mars_facts_table = _scrape_mars_facts()
        hemisphere_image_urls = _scrape_hemispheres(browser)
    finally:
        # Always release the browser, even when a selector or page
        # navigation fails mid-scrape (previously it leaked on error).
        browser.quit()

    return {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": featured_img,
        "mars_facts": mars_facts_table,
        "hemispheres": hemisphere_image_urls,
    }


def _scrape_news(browser):
    """Return (headline, teaser paragraph) of the latest NASA Mars news."""
    browser.visit("https://mars.nasa.gov/news")
    soup = bs(browser.html, 'html.parser')
    results = soup.find_all("div", class_="content_title")
    # Index 1 is used (not 0) — presumably the first "content_title" div
    # is a page-level heading rather than an article; kept as in original.
    news_title = results[1].get_text()
    news_p = soup.find("div", class_="article_teaser_body").text
    return news_title, news_p


def _scrape_featured_image(browser):
    """Return the full URL of JPL's currently featured Mars image."""
    browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    # Click through "Full image" then "more info" to reach the page that
    # exposes the full-size image inside a <figure class="lede"> element.
    browser.find_by_id('full_image').click()
    browser.links.find_by_partial_text('more info').click()
    soup = bs(browser.html, 'html.parser')
    results = soup.find_all('figure', class_='lede')
    relative_img_path = results[0].a['href']
    return 'https://www.jpl.nasa.gov' + relative_img_path


def _scrape_mars_facts():
    """Return the Mars facts table as an HTML string (no browser needed).

    pandas fetches and parses the page directly via read_html.
    """
    tables = pd.read_html("https://space-facts.com/mars/",
                          match="Equatorial Diameter")[0]
    # NOTE(review): both columns are renamed to a single space string,
    # which blanks out the default integer headers in the rendered HTML.
    # Kept as-is since downstream templates may rely on this exact output.
    df = tables.rename(columns={0: " ", 1: " "})
    return df.to_html(index=False, header=True, border=0)


def _scrape_hemispheres(browser):
    """Return a list of {"title", "img_url"} dicts, one per hemisphere."""
    browser.visit(
        "https://astrogeology.usgs.gov/search/results"
        "?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    hemisphere_image_urls = []
    # Element handles go stale after navigating away and back, so the
    # link list is re-queried by index on every loop iteration.
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        browser.find_by_css("a.product-item h3")[item].click()
        # The "Sample" anchor's href points at the full-resolution image.
        sample_element = browser.links.find_by_text("Sample").first
        hemisphere_image_urls.append({
            "title": browser.find_by_css("h2.title").text,
            "img_url": sample_element["href"],
        })
        # Navigate back to the results list for the next hemisphere.
        browser.back()
    return hemisphere_image_urls