[
reaction_count
-integer
+Integer
|
-total reaction count of post
+Total reaction count of post
|
@@ -368,10 +381,10 @@ total reaction count of post
comments
-integer
+Integer
|
-comments count of post
+Comments count of post
|
@@ -380,10 +393,10 @@ comments count of post
content
- string
+ String
|
-content of post as text
+Content of post as text
|
@@ -392,7 +405,7 @@ content of post as text
video
- string
+ String
|
URL of video present in that post
@@ -405,10 +418,10 @@ URL of video present in that post
image
|
- list
+ List
|
-python's list containing URLs of all images present in the post
+List containing URLs of all images present in the post
|
@@ -417,10 +430,10 @@ python's list containing URLs of all images present in the post
posted_on
-datetime
+Datetime
|
-time at which post was posted(in ISO 8601 format)
+Time at which post was posted (in ISO 8601 format)
|
@@ -429,7 +442,7 @@ time at which post was posted(in ISO 8601 format)
post_url
-string
+String
|
URL for that post
@@ -449,9 +462,10 @@ URL for that post
Tech
This project uses different libraries to work properly.
diff --git a/changelog.MD b/changelog.MD
index 405eead..20794b3 100644
--- a/changelog.MD
+++ b/changelog.MD
@@ -1,88 +1,89 @@
Changelog
+
+
+3.0.0
+Added:
+Feature to control whether to run the browser in headless or headful mode.
+Fixed:
+Updated the outdated selector for the latest Facebook layout.
+
+
+
+
+2.0.2
+Fixed:
+Fixed the README.md file-not-found error when installing from PyPI
+
+
+
+
2.0.0
-Added
+Added:
Timeout argument to set the maximum amount of time the bot should run in case no posts are found.
-Changes
+Changes:
Updated selenium from version 3.141.0 to 4.1.0
-Fixed
+Fixed:
Fixed issue of the browser scrolling back to the top despite calling the scroll-down method, caused by multiple different function calls
+
+
0.1.10
-Added
+Added:
Support for new Facebook Layout
-
+
- Changelog
+
0.1.9
-Added
+Added:
Added feature for using proxy while scraping
-
+
-
+
0.1.8
-Fixed
+Fixed:
Fixed error "name element not found" occuring to some specific users.
Fixed CSV file missing posted_on column
+
-
-
+
+
0.1.7
-Fixed
+Fixed:
Only scrolling once while scraping, which caused only the top few posts to be scraped. If the user wanted more than 27 posts, it used to get stuck.
+
-
-
-
+
+
+
0.1.6
Added
Added feature to read long paragraphs that lie behind the "Continue reading" button, e.g. posts like this
Fixed
Scrolling directly to the bottom of the page on start led to the login page; now it scrolls down to half of the page and closes the login popup
+
-
+
+
0.1.5
-Fixed
+Fixed:
Fixed feature to scrape IDs from different URL patterns
-
+
-
- 0.1.4
+
+
+ 0.1.4:
Added
Support for emoji and other languages (other than English) in output. They are no longer shown as Unicode escape sequences.
-
-
-
- Fixed
- Total reactions_count getting 0 as a output.
-
-
-
- 0.1.6
-Added
-Added feature to read long paragraph that lies inside "Continue reading" button. For e.g posts like this
-Fixed
-Scrolling down directly to the bottom of the page during on start leads to login page, now it scroll down to half of the page and closes the login popup
-
-
- 0.1.5
-Fixed
-Fixed feature to scrap id from different URL pattern
-
- 0.1.4
-Added
- Support for emoji and other different languages(other than english) in output. It is no longer shown as a Unicode string.
-
-
- Fixed
+ Fixed:
Total reactions_count getting 0 as output.
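For reference, a minimal usage sketch of the 3.0.0 headless toggle (the constructor signature is taken from the scraper.py diff below; the page name is a hypothetical placeholder):

from facebook_page_scraper import Facebook_scraper

# headless=False (new in 3.0.0) keeps the browser window visible while any scrap_* method runs,
# which is handy for debugging selectors; the default remains headless=True.
scraper = Facebook_scraper("some_page_name", posts_count=10,
                           browser="chrome", headless=False)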
diff --git a/facebook_page_scraper/__init__.py b/facebook_page_scraper/__init__.py
index 6f36dab..6b5d8bf 100644
--- a/facebook_page_scraper/__init__.py
+++ b/facebook_page_scraper/__init__.py
@@ -4,4 +4,5 @@
from .element_finder import Finder
from .scraping_utilities import Scraping_utilities
-__all__ = ["Initializer","Facebook_scraper","Utilities","Finder","Scraping_utilities"]
\ No newline at end of file
+__all__ = ["Initializer", "Facebook_scraper",
+ "Utilities", "Finder", "Scraping_utilities"]
diff --git a/facebook_page_scraper/driver_initialization.py b/facebook_page_scraper/driver_initialization.py
index 59d26e3..41eae71 100644
--- a/facebook_page_scraper/driver_initialization.py
+++ b/facebook_page_scraper/driver_initialization.py
@@ -13,14 +13,16 @@
class Initializer:
- def __init__(self, browser_name, proxy=None):
+ def __init__(self, browser_name, proxy=None, headless=True):
self.browser_name = browser_name
self.proxy = proxy
+ self.headless = headless
def set_properties(self, browser_option):
"""adds capabilities to the driver"""
- browser_option.add_argument(
- '--headless') # runs browser in headless mode
+ if self.headless:
+ browser_option.add_argument(
+ '--headless') # runs browser in headless mode
browser_option.add_argument('--no-sandbox')
browser_option.add_argument("--disable-dev-shm-usage")
browser_option.add_argument('--ignore-certificate-errors')
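A minimal sketch of how the new flag is consumed when the initializer is used directly (the Initializer(...).init() call is the same one made from scraper.py below; the browser choice is illustrative):

from facebook_page_scraper import Initializer

# With headless=False, set_properties() simply skips the '--headless' option,
# so a regular (headful) browser window opens.
driver = Initializer("chrome", proxy=None, headless=False).init()
driver.get("https://www.facebook.com")
driver.quit()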
diff --git a/facebook_page_scraper/driver_utilities.py b/facebook_page_scraper/driver_utilities.py
index 1091288..8465b46 100644
--- a/facebook_page_scraper/driver_utilities.py
+++ b/facebook_page_scraper/driver_utilities.py
@@ -1,18 +1,17 @@
#!/usr/bin/env python3
-from fileinput import close
-
try:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
- from selenium.common.exceptions import NoSuchElementException,WebDriverException
+ from selenium.common.exceptions import NoSuchElementException, WebDriverException
from random import randint
from selenium.webdriver.common.keys import Keys
import sys
except Exception as ex:
print(ex)
+
class Utilities:
@staticmethod
@@ -30,58 +29,61 @@ def __close_error_popup(driver):
like "We could not process your request. Please try again later" ,
than click on close button to skip that popup.'''
try:
- WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.layerCancel'))) #wait for popup to show
- button = driver.find_element(By.CSS_SELECTOR,"a.layerCancel") #grab that popup's close button
- button.click() #click "close" button
+ WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
+ (By.CSS_SELECTOR, 'a.layerCancel'))) # wait for popup to show
+ # grab that popup's close button
+ button = driver.find_element(By.CSS_SELECTOR, "a.layerCancel")
+ button.click() # click "close" button
except WebDriverException:
- #it is possible that even after waiting for given amount of time,modal may not appear
+ # it is possible that even after waiting for the given amount of time, the modal may not appear
pass
except NoSuchElementException:
- pass #passing this error silently because it may happen that popup never shows up
+ pass # passing this error silently because it may happen that popup never shows up
except Exception as ex:
- #if any other error occured except the above one
+ # if any other error occurred except the above ones
print("error at close_error_popup method : {}".format(ex))
@staticmethod
def __scroll_down_half(driver):
try:
- driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);")
+ driver.execute_script(
+ "window.scrollTo(0, document.body.scrollHeight / 2);")
except Exception as ex:
- #if any error occured than close the driver and exit
+ # if any error occurred then close the driver and exit
Utilities.__close_driver(driver)
print("error at scroll_down_half method : {}".format(ex))
@staticmethod
def __close_modern_layout_signup_modal(driver):
- try:
- driver.execute_script(
- "window.scrollTo(0, document.body.scrollHeight);")
- close_button = driver.find_element(By.CSS_SELECTOR,'[aria-label="Close"]')
- close_button.click()
- except NoSuchElementException:
- pass
- except Exception as ex:
- print("error at close_modern_layout_signup_modal: {}".format(ex))
-
+ try:
+ driver.execute_script(
+ "window.scrollTo(0, document.body.scrollHeight);")
+ close_button = driver.find_element(
+ By.CSS_SELECTOR, '[aria-label="Close"]')
+ close_button.click()
+ except NoSuchElementException:
+ pass
+ except Exception as ex:
+ print("error at close_modern_layout_signup_modal: {}".format(ex))
@staticmethod
- def __scroll_down(driver,layout):
+ def __scroll_down(driver, layout):
"""expects driver's instance as a argument, and it scrolls down page to the most bottom till the height"""
try:
- if layout == "old":
- driver.execute_script(
- "window.scrollTo(0, document.body.scrollHeight);")
- elif layout == "new":
- body = driver.find_element(By.CSS_SELECTOR,"body")
- for _ in range(randint(5,6)):
- body.send_keys(Keys.PAGE_UP)
- for _ in range(randint(5, 8)):
- body.send_keys(Keys.PAGE_DOWN)
- #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
- #Utilities.__close_modern_layout_signup_modal(driver)
+ if layout == "old":
+ driver.execute_script(
+ "window.scrollTo(0, document.body.scrollHeight);")
+ elif layout == "new":
+ body = driver.find_element(By.CSS_SELECTOR, "body")
+ for _ in range(randint(5, 6)):
+ body.send_keys(Keys.PAGE_UP)
+ for _ in range(randint(5, 8)):
+ body.send_keys(Keys.PAGE_DOWN)
+ #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ # Utilities.__close_modern_layout_signup_modal(driver)
except Exception as ex:
- #if any error occured than close the driver and exit
+ # if any error occurred then close the driver and exit
Utilities.__close_driver(driver)
print("error at scroll_down method : {}".format(ex))
@@ -89,61 +91,65 @@ def __scroll_down(driver,layout):
def __close_popup(driver):
"""expects driver's instance and closes modal that ask for login, by clicking "Not Now" button """
try:
- #Utilities.__scroll_down_half(driver) #try to scroll
- #wait for popup to show
- WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,'expanding_cta_close_button')))
- #grab "Not Now" button
- popup_close_button = driver.find_element(By.ID,'expanding_cta_close_button')
- popup_close_button.click() #click the button
+ # Utilities.__scroll_down_half(driver) #try to scroll
+ # wait for popup to show
+ WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
+ (By.ID, 'expanding_cta_close_button')))
+ # grab "Not Now" button
+ popup_close_button = driver.find_element(
+ By.ID, 'expanding_cta_close_button')
+ popup_close_button.click() # click the button
except WebDriverException:
- #modal may not popup, so no need to raise exception in case it is not found
+ # modal may not pop up, so no need to raise an exception in case it is not found
pass
except NoSuchElementException:
- pass #passing this exception silently as modal may not show up
+ pass # passing this exception silently as modal may not show up
except Exception as ex:
print("error at close_popup method : {}".format(ex))
@staticmethod
- def __wait_for_element_to_appear(driver,layout):
+ def __wait_for_element_to_appear(driver, layout):
"""expects driver's instance, wait for posts to show.
post's CSS class name is userContentWrapper
"""
try:
if layout == "old":
- #wait for page to load so posts are visible
- body = driver.find_element(By.CSS_SELECTOR,"body")
- for _ in range(randint(3, 5)):
- body.send_keys(Keys.PAGE_DOWN)
- WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper')))
+ # wait for page to load so posts are visible
+ body = driver.find_element(By.CSS_SELECTOR, "body")
+ for _ in range(randint(3, 5)):
+ body.send_keys(Keys.PAGE_DOWN)
+ WebDriverWait(driver, 30).until(EC.presence_of_element_located(
+ (By.CSS_SELECTOR, '.userContentWrapper')))
elif layout == "new":
- WebDriverWait(driver, 30).until(
- EC.presence_of_element_located((By.CSS_SELECTOR, "[aria-posinset]")))
+ WebDriverWait(driver, 30).until(
+ EC.presence_of_element_located((By.CSS_SELECTOR, "[aria-posinset]")))
except WebDriverException:
- #if it was not found,it means either page is not loading or it does not exists
+ # if it was not found, it means either the page is not loading or it does not exist
print("No posts were found!")
Utilities.__close_driver(driver)
- sys.exit(1) #exit the program, because if posts does not exists,we cannot go further
+ # exit the program, because if posts do not exist, we cannot go further
+ sys.exit(1)
except Exception as ex:
print("error at wait_for_element_to_appear method : {}".format(ex))
Utilities.__close_driver(driver)
-
-
@staticmethod
- def __click_see_more(driver,content):
+ def __click_see_more(driver, content):
"""expects driver's instance and selenium element, click on "see more" link to open hidden content"""
try:
- #find element and click 'see more' button
- element = content.find_element(By.CSS_SELECTOR,'span.see_more_link_inner')
- driver.execute_script("arguments[0].click();", element) #click button using js
+ # find element and click 'see more' button
+ element = content.find_element(
+ By.CSS_SELECTOR, 'span.see_more_link_inner')
+ # click button using js
+ driver.execute_script("arguments[0].click();", element)
except NoSuchElementException:
- #if it doesn't exists than no need to raise any error
+ # if it doesn't exist then no need to raise any error
pass
except AttributeError:
pass
except IndexError:
pass
except Exception as ex:
- print("error at click_see_more method : {}".format(ex))
\ No newline at end of file
+ print("error at click_see_more method : {}".format(ex))
diff --git a/facebook_page_scraper/element_finder.py b/facebook_page_scraper/element_finder.py
index 746c256..2467394 100644
--- a/facebook_page_scraper/element_finder.py
+++ b/facebook_page_scraper/element_finder.py
@@ -13,106 +13,111 @@
except Exception as ex:
print(ex)
+
class Finder():
"""
Holds the collection of methods that find elements of Facebook posts using selenium webdriver's methods
"""
@staticmethod
def __get_status_link(link_list):
- status = ""
- for link in link_list:
- link_value = link.get_attribute("href")
- if "/posts/" in link_value and "/groups/" in link_value:
- status = link
- break
- if "/posts/" in link_value:
- status = link
- break
- if "/videos/pcb" in link_value:
- status = link
- break
- elif "/photos/" in link_value:
- # print(link_value)
- status = link
- break
- if "fbid=" in link_value:
- status = link
- break
- elif "/group/" in link_value:
- # print(link_value)
- status = link
- break
- if "/videos/" in link_value:
- # print(link_value)
- status = link
- break
- elif "/groups/" in link_value:
- # print(link_value)
- status = link
- break
- return status
+ status = ""
+ for link in link_list:
+ link_value = link.get_attribute("href")
+ if "/posts/" in link_value and "/groups/" in link_value:
+ status = link
+ break
+ if "/posts/" in link_value:
+ status = link
+ break
+ if "/videos/pcb" in link_value:
+ status = link
+ break
+ elif "/photos/" in link_value:
+ # print(link_value)
+ status = link
+ break
+ if "fbid=" in link_value:
+ status = link
+ break
+ elif "/group/" in link_value:
+ # print(link_value)
+ status = link
+ break
+ if "/videos/" in link_value:
+ # print(link_value)
+ status = link
+ break
+ elif "/groups/" in link_value:
+ # print(link_value)
+ status = link
+ break
+ return status
@staticmethod
- def __find_status(post,layout):
+ def __find_status(post, layout):
"""finds URL of the post, then extracts link from that URL and returns it"""
try:
link = None
if layout == "old":
- #aim is to find element that looks like
- #after finding that element, get it's href value and pass it to different method that extracts post_id from that href
- status_link = post.find_element(By.CLASS_NAME,"_5pcq").get_attribute("href")
- #extract out post id from post's url
- status = Scraping_utilities._Scraping_utilities__extract_id_from_link(status_link)
+ # aim is to find element that looks like
+ # after finding that element, get its href value and pass it to a different method that extracts post_id from that href
+ status_link = post.find_element(
+ By.CLASS_NAME, "_5pcq").get_attribute("href")
+ # extract out post id from post's url
+ status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
+ status_link)
elif layout == "new":
- #links = post.find_elements(By.CSS_SELECTOR,"a[role='link']")
- link = post.find_element(By.CSS_SELECTOR,'.gpro0wi8.b1v8xokw')
- status_link = link.get_attribute('href')
- status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
- status_link)
+ #links = post.find_elements(By.CSS_SELECTOR,"a[role='link']")
+ link = post.find_element(By.CSS_SELECTOR, 'a[aria-label]')
+ status_link = link.get_attribute('href')
+ status = Scraping_utilities._Scraping_utilities__extract_id_from_link(
+ status_link)
except NoSuchElementException:
- #if element is not found
+ # if element is not found
status = "NA"
except Exception as ex:
print("error at find_status method : {}".format(ex))
status = "NA"
- return (status,status_link,link)
+ return (status, status_link, link)
@staticmethod
- def __find_share(post,layout):
+ def __find_share(post, layout):
"""finds shares count of the facebook post using selenium's webdriver's method"""
try:
if layout == "old":
- #aim is to find element that have datatest-id attribute as UFI2SharesCount/root
- shares = post.find_element(By.CSS_SELECTOR,"[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
- shares = Scraping_utilities._Scraping_utilities__extract_numbers(shares)
+ # aim is to find element that has data-testid attribute as UFI2SharesCount/root
+ shares = post.find_element(
+ By.CSS_SELECTOR, "[data-testid='UFI2SharesCount/root']").get_attribute('textContent')
+ shares = Scraping_utilities._Scraping_utilities__extract_numbers(
+ shares)
elif layout == "new":
- elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
- shares = "0"
- for element in elements:
- text = element.text
- if "share" in text:
- shares = re.findall("\d+", text)[0]
- break
+ elements = post.find_elements(By.CSS_SELECTOR, "div.gtad4xkn")
+ shares = "0"
+ for element in elements:
+ text = element.text
+ if "share" in text:
+ shares = re.findall("\d+", text)[0]
+ break
return shares
except NoSuchElementException:
- #if element is not present that means there wasn't any shares
+ # if element is not present, that means there weren't any shares
shares = 0
except Exception as ex:
print("error at find_share method : {}".format(ex))
shares = 0
-
return shares
@staticmethod
def __find_reactions(post):
"""finds all reaction of the facebook post using selenium's webdriver's method"""
try:
- #find element that have attribute aria-label as 'See who reacted to this
- reactions_all = post.find_element(By.CSS_SELECTOR,'[aria-label="See who reacted to this"]')
+ # find element that has attribute aria-label as 'See who reacted to this'
+ reactions_all = post.find_element(
+ By.CSS_SELECTOR, '[aria-label="See who reacted to this"]')
except NoSuchElementException:
reactions_all = ""
except Exception as ex:
@@ -120,22 +125,24 @@ def __find_reactions(post):
return reactions_all
@staticmethod
- def __find_comments(post,layout):
+ def __find_comments(post, layout):
"""finds comments count of the facebook post using selenium's webdriver's method"""
try:
comments = ""
if layout == "old":
- comments = post.find_element(By.CSS_SELECTOR,"a._3hg-").get_attribute('textContent')
- #extract numbers from text
- comments = Scraping_utilities._Scraping_utilities__extract_numbers(comments)
+ comments = post.find_element(
+ By.CSS_SELECTOR, "a._3hg-").get_attribute('textContent')
+ # extract numbers from text
+ comments = Scraping_utilities._Scraping_utilities__extract_numbers(
+ comments)
elif layout == "new":
- elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn")
- comments = "0"
- for element in elements:
- text = element.text
- if "comment" in text:
- comments = Scraping_utilities._Scraping_utilities__extract_numbers(
- comments)
+ elements = post.find_elements(By.CSS_SELECTOR, "div.gtad4xkn")
+ comments = "0"
+ for element in elements:
+ text = element.text
+ if "comment" in text:
+ comments = Scraping_utilities._Scraping_utilities__extract_numbers(
+ comments)
except NoSuchElementException:
comments = 0
except Exception as ex:
@@ -153,68 +160,74 @@ def __fetch_post_passage(href):
post_message_div_finder_regex = '(.*?)<\/div>'
- post_message = re.search(post_message_div_finder_regex,text)
+ post_message = re.search(post_message_div_finder_regex, text)
replace_html_tags_regex = '<[^<>]+>'
- message = re.sub(replace_html_tags_regex,'',post_message.group(0))
+ message = re.sub(replace_html_tags_regex, '', post_message.group(0))
return message
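A standalone illustration of the tag-stripping regex used just above in __fetch_post_passage (the sample HTML is made up):

import re

replace_html_tags_regex = '<[^<>]+>'
html = '<div><p>Hello <b>world</b></p></div>'
print(re.sub(replace_html_tags_regex, '', html))  # -> Hello world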
@staticmethod
- def __element_exists(element,css_selector):
+ def __element_exists(element, css_selector):
try:
- found = element.find_element(By.CSS_SELECTOR,css_selector)
+ found = element.find_element(By.CSS_SELECTOR, css_selector)
return True
except NoSuchElementException:
return False
@staticmethod
- def __find_content(post,driver,layout):
+ def __find_content(post, driver, layout):
"""finds content of the facebook post using selenium's webdriver's method and returns string containing text of the posts"""
try:
if layout == "old":
- post_content = post.find_element(By.CLASS_NAME,'userContent')
+ post_content = post.find_element(By.CLASS_NAME, 'userContent')
elif layout == "new":
- post_content = post.find_element(By.CSS_SELECTOR,'[data-ad-preview="message"]')
- #if 'See more' or 'Continue reading' is present in post
- if Finder._Finder__element_exists(post_content,"span.text_exposed_link > a"):
- element = post_content.find_element(By.CSS_SELECTOR,"span.text_exposed_link > a") #grab that element
- #if element have already the onclick function, that means it is expandable paragraph
+ post_content = post.find_element(
+ By.CSS_SELECTOR, '[data-ad-preview="message"]')
+ # if 'See more' or 'Continue reading' is present in post
+ if Finder._Finder__element_exists(post_content, "span.text_exposed_link > a"):
+ element = post_content.find_element(
+ By.CSS_SELECTOR, "span.text_exposed_link > a") # grab that element
+ # if the element already has the onclick function, that means it is an expandable paragraph
if element.get_attribute("onclick"):
- Utilities._Utilities__click_see_more(driver,post_content) #click 'see more' button to get hidden text as well
- content = Scraping_utilities._Scraping_utilities__extract_content(post_content) #extract content out of it
- elif element.get_attribute("target"): #if element have attribute of target="_blank"
- #if it does not have onclick() method, it means we'll to extract passage by request
- #if content have attribute target="_blank" it indicates that text will open in new tab,
- #so make a seperate request and get that text
- content = Finder._Finder__fetch_post_passage(element.get_attribute("href"))
+ # click 'see more' button to get hidden text as well
+ Utilities._Utilities__click_see_more(driver, post_content)
+ content = Scraping_utilities._Scraping_utilities__extract_content(
+ post_content) # extract content out of it
+ # if the element has the attribute target="_blank"
+ elif element.get_attribute("target"):
+ # if it does not have the onclick() method, it means we'll have to extract the passage via a request
+ # if the content has attribute target="_blank", it indicates that the text will open in a new tab,
+ # so make a separate request and get that text
+ content = Finder._Finder__fetch_post_passage(
+ element.get_attribute("href"))
else:
- #if it does not have see more, just get the text out of it
+ # if it does not have see more, just get the text out of it
content = post_content.get_attribute("textContent")
except NoSuchElementException:
- #if [data-testid="post_message"] is not found, it means that post did not had any text,either it is image or video
+ # if [data-testid="post_message"] is not found, it means that the post did not have any text; it is either an image or a video
content = ""
except Exception as ex:
print("error at find_content method : {}".format(ex))
content = ""
return content
-
@staticmethod
- def __find_posted_time(post,layout,link_element):
+ def __find_posted_time(post, layout, link_element):
"""finds posted time of the facebook post using selenium's webdriver's method"""
try:
- #extract element that looks like
+ # extract element that looks like
#posted_time = post.find_element_by_css_selector("abbr._5ptz").get_attribute("data-utime")
if layout == "old":
- posted_time = post.find_element(By.TAG_NAME,"abbr").get_attribute('data-utime')
- return datetime.datetime.fromtimestamp(float(posted_time)).isoformat()
+ posted_time = post.find_element(
+ By.TAG_NAME, "abbr").get_attribute('data-utime')
+ return datetime.datetime.fromtimestamp(float(posted_time)).isoformat()
elif layout == "new":
- aria_label_value = link_element.get_attribute("aria-label")
- timestamp = parse(aria_label_value).isoformat() if len(
- aria_label_value) > 5 else Scraping_utilities._Scraping_utilities__convert_to_iso(aria_label_value)
- return timestamp
+ aria_label_value = link_element.get_attribute("aria-label")
+ timestamp = parse(aria_label_value).isoformat() if len(
+ aria_label_value) > 5 else Scraping_utilities._Scraping_utilities__convert_to_iso(aria_label_value)
+ return timestamp
except dateutil.parser._parser.ParserError:
timestamp = Scraping_utilities._Scraping_utilities__convert_to_iso(
aria_label_value)
@@ -226,14 +239,14 @@ def __find_posted_time(post,layout,link_element):
timestamp = ""
return timestamp
-
@staticmethod
- def __find_video_url(post,page_name,status):
+ def __find_video_url(post, page_name, status):
"""finds video of the facebook post using selenium's webdriver's method"""
try:
- #if video is found in the post, than create a video URL by concatenating post's id with page_name
- video_element = post.find_element(By.TAG_NAME,"video")
- video = "https://www.facebook.com/{}/videos/{}".format(page_name,status)
+ # if a video is found in the post, then create a video URL by concatenating the post's id with page_name
+ video_element = post.find_element(By.TAG_NAME, "video")
+ video = "https://www.facebook.com/{}/videos/{}".format(
+ page_name, status)
except NoSuchElementException:
video = ""
@@ -248,10 +261,12 @@ def __find_video_url(post,page_name,status):
def __find_image_url(post):
"""finds all image of the facebook post using selenium's webdriver's method"""
try:
- #find all img tag that looks like ![]()
- images = post.find_elements(By.CSS_SELECTOR,"img.scaledImageFitWidth.img")
- #extract src attribute from all the img tag,store it in list
- sources = [image.get_attribute("src") for image in images] if len(images) > 0 else []
+ # find all img tags that look like ![]()
+ images = post.find_elements(
+ By.CSS_SELECTOR, "img.scaledImageFitWidth.img")
+ # extract src attribute from all the img tags and store them in a list
+ sources = [image.get_attribute("src") for image in images] if len(
+ images) > 0 else []
except NoSuchElementException:
sources = []
pass
@@ -262,18 +277,20 @@ def __find_image_url(post):
return sources
@staticmethod
- def __find_all_posts(driver,layout):
+ def __find_all_posts(driver, layout):
"""finds all posts of the facebook page using selenium's webdriver's method"""
try:
- #find all posts that looks like
+ # find all posts that look like
if layout == "old":
- all_posts = driver.find_elements(By.CSS_SELECTOR,"div.userContentWrapper")
+ all_posts = driver.find_elements(
+ By.CSS_SELECTOR, "div.userContentWrapper")
elif layout == "new":
- all_posts = driver.find_elements(By.CSS_SELECTOR,'[aria-posinset]')
+ all_posts = driver.find_elements(
+ By.CSS_SELECTOR, '[aria-posinset]')
return all_posts
except NoSuchElementException:
print("Cannot find any posts! Exiting!")
- #if this fails to find posts that means, code cannot move forward, as no post is found
+ # if this fails to find posts, that means the code cannot move forward, as no post is found
Utilities.__close_driver(driver)
sys.exit(1)
except Exception as ex:
@@ -282,40 +299,41 @@ def __find_all_posts(driver,layout):
sys.exit(1)
@staticmethod
- def __find_name(driver,layout):
+ def __find_name(driver, layout):
"""finds name of the facebook page using selenium's webdriver's method"""
try:
if layout == "old":
- name = driver.find_element(By.CSS_SELECTOR,'a._64-f').get_attribute('textContent')
+ name = driver.find_element(
+ By.CSS_SELECTOR, 'a._64-f').get_attribute('textContent')
elif layout == "new":
- name = driver.find_element(By.TAG_NAME,"strong").get_attribute("textContent")
+ name = driver.find_element(
+ By.TAG_NAME, "strong").get_attribute("textContent")
return name
except Exception as ex:
print("error at __find_name method : {}".format(ex))
@staticmethod
def __detect_ui(driver):
- try:
- driver.find_element(By.ID,"pagelet_bluebar")
- return "old"
- except NoSuchElementException:
- return "new"
- except Exception as ex:
- print("error art __detect_ui: {}".format(ex))
- Utilities.__close_driver(driver)
- sys.exit(1)
+ try:
+ driver.find_element(By.ID, "pagelet_bluebar")
+ return "old"
+ except NoSuchElementException:
+ return "new"
+ except Exception as ex:
+ print("error art __detect_ui: {}".format(ex))
+ Utilities.__close_driver(driver)
+ sys.exit(1)
@staticmethod
def __find_reaction(layout, reactions_all):
- try:
- if layout == "old":
- return reactions_all.find_elements(By.TAG_NAME,
- "a")
- elif layout == "new":
- return reactions_all.find_elements(By.TAG_NAME,
- "div")
-
- except Exception as ex:
- print("find_reaction", ex)
- return ""
+ try:
+ if layout == "old":
+ return reactions_all.find_elements(By.TAG_NAME,
+ "a")
+ elif layout == "new":
+ return reactions_all.find_elements(By.TAG_NAME,
+ "div")
+ except Exception as ex:
+ print("find_reaction", ex)
+ return ""
diff --git a/facebook_page_scraper/scraper.py b/facebook_page_scraper/scraper.py
index a06bc18..c90d5af 100644
--- a/facebook_page_scraper/scraper.py
+++ b/facebook_page_scraper/scraper.py
@@ -4,7 +4,6 @@
from .driver_utilities import Utilities
from .element_finder import Finder
from .scraping_utilities import Scraping_utilities
- from selenium.common.exceptions import NoSuchElementException
import json
import csv
import os
@@ -13,34 +12,33 @@
except Exception as ex:
print(ex)
+
class Facebook_scraper:
- __data_dict = {} #this dictionary stores all post's data
+ __data_dict = {} # this dictionary stores all post's data
- #when we scroll and extract all posts,it may happens that we extract same posts over and over,so this lead to too much iteration
- #and waste time to iterate over and over the same post, to solve that,
+ # when we scroll and extract all posts, it may happen that we extract the same posts over and over, so this leads to too much iteration
+ # and wastes time iterating over the same posts; to solve that,
# problem I needed a data structure which
# 1) removes duplicates from itself automatically,
# 2) provides search of element,
# 3) compatible with list's unpacking to quickly add element inside itself from list
# set() seems to be doing the work properly
- #__extracted_post contains all the post's ID that have been scraped before and as it set() it avoids post's ID duplication.
+ # __extracted_post contains all the post IDs that have been scraped before, and as it is a set() it avoids post ID duplication.
__extracted_post = set()
-
- #condition,
- #1) if we reach bottom of the page and post is not longer available, and we don't meet the number of posts that we need to find
- #2) if we were given wrong page_name, and it does not exists in fb than no post will exist.
- #with above condition being true, the crawler will keep on scrolling the page to find posts
+ # condition,
+ # 1) if we reach the bottom of the page and posts are no longer available, and we don't meet the number of posts that we need to find
+ # 2) if we were given a wrong page_name, and it does not exist on fb, then no posts will exist.
+ # with the above conditions being true, the crawler will keep on scrolling the page to find posts
# and it will get stuck in an infinite loop, which may cause the machine to crash
- #to solve the problem, I have declared a class member "retry",assigned it value 10.
- #it checks 'retry' times if posts does not exists.
+ # to solve the problem, I have declared a class member "retry" and assigned it the value 10.
+ # it checks 'retry' times if posts do not exist.
# __no_post_found method subtracts 1 every time a post is not found.
- #on each iteration __close_after_retry is called to check if retry have turned to 0
+ # on each iteration __close_after_retry is called to check if retry has turned to 0
# if it returns true, it will break the loop. After coming out of the loop, the driver will be closed and it will return whatever posts were found
-
- def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=600):
+ def __init__(self, page_name, posts_count=10, browser="chrome", proxy=None, timeout=600, headless=True):
self.page_name = page_name
self.posts_count = int(posts_count)
#self.URL = "https://en-gb.facebook.com/pg/{}/posts".format(self.page_name)
@@ -50,214 +48,242 @@ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=6
self.proxy = proxy
self.__layout = ''
self.timeout = timeout
+ self.headless = headless
def __start_driver(self):
"""changes the class member __driver value to driver on call"""
- self.__driver = Initializer(self.browser,self.proxy).init()
- def __handle_popup(self,layout):
- #while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button
+ self.__driver = Initializer(
+ self.browser, self.proxy, self.headless).init()
+
+ def __handle_popup(self, layout):
+ # while scrolling, wait for the login popup to show; it can be skipped by clicking the "Not Now" button
try:
- if layout == "old":
- #if during scrolling any of error or signup popup shows
- Utilities._Utilities__close_error_popup(self.__driver)
- Utilities._Utilities__close_popup(self.__driver)
- elif layout == "new":
- Utilities._Utilities__close_modern_layout_signup_modal(self.__driver)
+ if layout == "old":
+ # if during scrolling any of error or signup popup shows
+ Utilities._Utilities__close_error_popup(self.__driver)
+ Utilities._Utilities__close_popup(self.__driver)
+ elif layout == "new":
+ Utilities._Utilities__close_modern_layout_signup_modal(
+ self.__driver)
except Exception as ex:
- print(ex)
+ print(ex)
- def __check_timeout(self,start_time,current_time):
- return (current_time-start_time) > self.timeout
+ def __check_timeout(self, start_time, current_time):
+ return (current_time-start_time) > self.timeout
def scrap_to_json(self):
- #call the __start_driver and override class member __driver to webdriver's instance
+ # call the __start_driver and override class member __driver to webdriver's instance
self.__start_driver()
starting_time = time.time()
- #navigate to URL
+ # navigate to URL
self.__driver.get(self.URL)
self.__layout = Finder._Finder__detect_ui(self.__driver)
- #sometimes we get popup that says "your request couldn't be processed", however
- #posts are loading in background if popup is closed, so call this method in case if it pops up.
+ # sometimes we get a popup that says "your request couldn't be processed", however
+ # posts keep loading in the background if the popup is closed, so call this method in case it pops up.
Utilities._Utilities__close_error_popup(self.__driver)
- #wait for post to load
- Utilities._Utilities__wait_for_element_to_appear(self.__driver,self.__layout)
- #scroll down to bottom most
- Utilities._Utilities__scroll_down(self.__driver,self.__layout)
+ # wait for post to load
+ Utilities._Utilities__wait_for_element_to_appear(
+ self.__driver, self.__layout)
+ # scroll down to bottom most
+ Utilities._Utilities__scroll_down(self.__driver, self.__layout)
self.__handle_popup(self.__layout)
-
- name = Finder._Finder__find_name(self.__driver,self.__layout) #find name element
+ name = Finder._Finder__find_name(
+ self.__driver, self.__layout) # find name element
while len(self.__data_dict) <= self.posts_count:
self.__handle_popup(self.__layout)
self.__find_elements(name)
current_time = time.time()
- if self.__check_timeout(starting_time,current_time) is True:
- print("Timeout...")
- break
- Utilities._Utilities__scroll_down(self.__driver, self.__layout) #scroll down
- #print(len(self.__data_dict))
- #close the browser window after job is done.
+ if self.__check_timeout(starting_time, current_time) is True:
+ print("Timeout...")
+ break
+ Utilities._Utilities__scroll_down(
+ self.__driver, self.__layout) # scroll down
+ # print(len(self.__data_dict))
+ # close the browser window after job is done.
Utilities._Utilities__close_driver(self.__driver)
- #dict trimming, might happen that we find more posts than it was asked, so just trim it
- self.__data_dict = dict(list(self.__data_dict.items())[0:int(self.posts_count)])
+ # dict trimming, it might happen that we find more posts than were asked for, so just trim it
+ self.__data_dict = dict(list(self.__data_dict.items())[
+ 0:int(self.posts_count)])
- return json.dumps(self.__data_dict,ensure_ascii=False)
+ return json.dumps(self.__data_dict, ensure_ascii=False)
- def __json_to_csv(self,filename,json_data,directory):
+ def __json_to_csv(self, filename, json_data, directory):
- os.chdir(directory) #change working directory to given directory
- #headers of the CSV file
- fieldnames = ['id','name','shares','likes','loves','wow','cares','sad','angry','haha','reactions_count','comments',
- 'content','posted_on','video'
- ,'image','post_url']
- #open and start writing to CSV files
- with open("{}.csv".format(filename),'w',newline='',encoding="utf-8") as data_file:
- writer = csv.DictWriter(data_file,fieldnames=fieldnames) #instantiate DictWriter for writing CSV file
+ os.chdir(directory) # change working directory to given directory
+ # headers of the CSV file
+ fieldnames = ['id', 'name', 'shares', 'likes', 'loves', 'wow', 'cares', 'sad', 'angry', 'haha', 'reactions_count', 'comments',
+ 'content', 'posted_on', 'video', 'image', 'post_url']
+ # open and start writing to CSV files
+ with open("{}.csv".format(filename), 'w', newline='', encoding="utf-8") as data_file:
+ # instantiate DictWriter for writing CSV file
+ writer = csv.DictWriter(data_file, fieldnames=fieldnames)
- writer.writeheader() #write headers to CSV file
- #iterate over entire dictionary, write each posts as a row to CSV file
+ writer.writeheader() # write headers to CSV file
+ # iterate over entire dictionary, write each posts as a row to CSV file
for key in json_data:
- #parse post in a dictionary and write it as a single row
- row = {'id': key,'name' : json_data[key]['name'],'shares':json_data[key]['shares'],
- 'likes' : json_data[key]['reactions']['likes'],'loves' : json_data[key]['reactions']['loves'],
- 'wow' : json_data[key]['reactions']['wow'],'cares' : json_data[key]['reactions']['cares'],
- 'sad' : json_data[key]['reactions']['sad'],'angry' : json_data[key]['reactions']['angry'],
- 'haha' : json_data[key]['reactions']['haha'],'reactions_count' : json_data[key]['reaction_count'],
- 'comments' : json_data[key]['comments'],'content' : json_data[key]['content'],'posted_on' : json_data[key]['posted_on'],
- 'video' : json_data[key]['video'],'image': " ".join(json_data[key]['image'])
- ,'post_url' : json_data[key]['post_url']
- }
- writer.writerow(row) #write row to CSV file
-
- data_file.close() #after writing close the file
-
- def scrap_to_csv(self,filename,directory = os.getcwd()):
+ # parse post in a dictionary and write it as a single row
+ row = {'id': key, 'name': json_data[key]['name'], 'shares': json_data[key]['shares'],
+ 'likes': json_data[key]['reactions']['likes'], 'loves': json_data[key]['reactions']['loves'],
+ 'wow': json_data[key]['reactions']['wow'], 'cares': json_data[key]['reactions']['cares'],
+ 'sad': json_data[key]['reactions']['sad'], 'angry': json_data[key]['reactions']['angry'],
+ 'haha': json_data[key]['reactions']['haha'], 'reactions_count': json_data[key]['reaction_count'],
+ 'comments': json_data[key]['comments'], 'content': json_data[key]['content'], 'posted_on': json_data[key]['posted_on'],
+ 'video': json_data[key]['video'], 'image': " ".join(json_data[key]['image']), 'post_url': json_data[key]['post_url']
+ }
+ writer.writerow(row) # write row to CSV file
+
+ data_file.close() # after writing close the file
+
+ def scrap_to_csv(self, filename, directory=os.getcwd()):
try:
- data = self.scrap_to_json() #get the data in JSON format from the same class method
- self.__json_to_csv(filename,json.loads(data),directory) #convert it and write to CSV
+ data = self.scrap_to_json() # get the data in JSON format from the same class method
+ # convert it and write to CSV
+ self.__json_to_csv(filename, json.loads(data), directory)
return True
except Exception as ex:
print(ex)
return False
- def __remove_duplicates(self,all_posts):
+ def __remove_duplicates(self, all_posts):
"""takes a list of posts and removes duplicates from it and returns the list"""
- if len(self.__extracted_post) == 0: #if self.__extracted_post is empty that means it is first extraction
- self.__extracted_post.update(all_posts) #if it does than just add all the elements from the lists to __extracted_post set()
- return all_posts #return the all_posts without any changes as it is first time and no duplicate is present
+ if len(self.__extracted_post) == 0: # if self.__extracted_post is empty, that means it is the first extraction
+ # if so, then just add all the elements from the list to the __extracted_post set()
+ self.__extracted_post.update(all_posts)
+ return all_posts # return all_posts without any changes as it is the first time and no duplicates are present
else:
- #if self.extracted posts have some element than compare it with all_posts's element and return a new list containing new element
- removed_duplicated = [post for post in all_posts if post not in self.__extracted_post]
- self.__extracted_post.update(all_posts) #after removing duplicates, add all those new element to extracted_posts, as it
- return removed_duplicated #is set() it won't have duplicate elements
+ # if self.__extracted_post has some elements, then compare them with all_posts's elements and return a new list containing the new elements
+ removed_duplicated = [
+ post for post in all_posts if post not in self.__extracted_post]
+ # after removing duplicates, add all those new elements to extracted_posts, as it
+ self.__extracted_post.update(all_posts)
+ return removed_duplicated # is a set() it won't have duplicate elements
def __close_after_retry(self):
"""returns if class member retry is 0"""
return self.retry <= 0
-
- def __no_post_found(self,all_posts):
+ def __no_post_found(self, all_posts):
"""if all_posts were found to be length of 0"""
if len(all_posts) == 0:
- #if length of posts is 0,decrement retry by 1
+ # if length of posts is 0, decrement retry by 1
self.retry -= 1
- def __find_elements(self,name):
+ def __find_elements(self, name):
"""find elements of posts and add them to data_dict"""
- all_posts = Finder._Finder__find_all_posts(self.__driver,self.__layout) #find all posts
- all_posts = self.__remove_duplicates(all_posts) #remove duplicates from the list
+ all_posts = Finder._Finder__find_all_posts(
+ self.__driver, self.__layout) # find all posts
+ all_posts = self.__remove_duplicates(
+ all_posts) # remove duplicates from the list
- #iterate over all the posts and find details from the same
+ # iterate over all the posts and find details from the same
for post in all_posts:
try:
- #find post ID from post
- status,post_url,link_element = Finder._Finder__find_status(post,self.__layout)
- #find share from the post
- shares = Finder._Finder__find_share(post,self.__layout)
- #converting shares to number
- #e.g if 5k than it should be 5000
- shares = int(Scraping_utilities._Scraping_utilities__value_to_float(shares))
- #find all reactions
+ # find post ID from post
+ status, post_url, link_element = Finder._Finder__find_status(
+ post, self.__layout)
+ # find share from the post
+ shares = Finder._Finder__find_share(post, self.__layout)
+ # converting shares to number
+ # e.g. if 5k then it should be 5000
+ shares = int(
+ Scraping_utilities._Scraping_utilities__value_to_float(shares))
+ # find all reactions
reactions_all = Finder._Finder__find_reactions(post)
- #find all anchor tags in reactions_all list
- all_hrefs_in_react = Finder._Finder__find_reaction(self.__layout,reactions_all,) if type(
+ # find all anchor tags in reactions_all list
+ all_hrefs_in_react = Finder._Finder__find_reaction(self.__layout, reactions_all,) if type(
reactions_all) != str else ""
- #if hrefs were found
- #all_hrefs contains elements like
- #["5 comments","54 Likes"] and so on
+ # if hrefs were found
+ # all_hrefs contains elements like
+ # ["5 comments","54 Likes"] and so on
if type(all_hrefs_in_react) == list:
- l = [i.get_attribute("aria-label") for i in all_hrefs_in_react]
+ l = [i.get_attribute("aria-label")
+ for i in all_hrefs_in_react]
else:
l = []
- #extract that aria-label from all_hrefs_in_react list and than extract number from them seperately
- #if Like aria-label is in the list, than extract it and extract numbers from that text
+ # extract the aria-label values from the all_hrefs_in_react list and then extract the numbers from them separately
+ # if Like aria-label is in the list, then extract it and extract numbers from that text
- likes = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Like")
+ likes = Scraping_utilities._Scraping_utilities__find_reaction_by_text(
+ l, "Like")
- #if Love aria-label is in the list, than extract it and extract numbers from that text
+ # if Love aria-label is in the list, then extract it and extract numbers from that text
loves = Scraping_utilities._Scraping_utilities__find_reaction_by_text(
l, "Love")
- #if Wow aria-label is in the list, than extract it and extract numbers from that text
+ # if Wow aria-label is in the list, then extract it and extract numbers from that text
wow = Scraping_utilities._Scraping_utilities__find_reaction_by_text(
l, "Wow")
- #if Care aria-label is in the list, than extract it and extract numbers from that text
- cares = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Care")
- #if Sad aria-label is in the list, than extract it and extract numbers from that text
- sad = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Sad")
- #if Angry aria-label is in the list, than extract it and extract numbers from that text
- angry = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Angry")
- #if Haha aria-label is in the list, than extract it and extract numbers from that text
+ # if Care aria-label is in the list, then extract it and extract numbers from that text
+ cares = Scraping_utilities._Scraping_utilities__find_reaction_by_text(
+ l, "Care")
+ # if Sad aria-label is in the list, than extract it and extract numbers from that text
+ sad = Scraping_utilities._Scraping_utilities__find_reaction_by_text(
+ l, "Sad")
+ # if Angry aria-label is in the list, than extract it and extract numbers from that text
+ angry = Scraping_utilities._Scraping_utilities__find_reaction_by_text(
+ l, "Angry")
+ # if Haha aria-label is in the list, than extract it and extract numbers from that text
haha = Scraping_utilities._Scraping_utilities__find_reaction_by_text(
l, "Haha")
- #converting all reactions to numbers
- #e,g reactions may contain counts like "5k","5m", so converting them to actual number
- likes = Scraping_utilities._Scraping_utilities__value_to_float(likes)
- loves = Scraping_utilities._Scraping_utilities__value_to_float(loves)
- wow = Scraping_utilities._Scraping_utilities__value_to_float(wow)
- cares = Scraping_utilities._Scraping_utilities__value_to_float(cares)
- sad = Scraping_utilities._Scraping_utilities__value_to_float(sad)
- angry = Scraping_utilities._Scraping_utilities__value_to_float(angry)
- haha = Scraping_utilities._Scraping_utilities__value_to_float(haha)
-
- reactions = {"likes" : int(likes),"loves" : int(loves),"wow":int(wow),"cares" : int(cares),"sad":int(sad),
- "angry":
- int(angry),"haha" : int(haha)}
-
- #count number of total reactions
- total_reaction_count = Scraping_utilities._Scraping_utilities__count_reaction(reactions)
-
- comments = Finder._Finder__find_comments(post,self.__layout)
- comments = int(Scraping_utilities._Scraping_utilities__value_to_float(comments))
- post_content = Finder._Finder__find_content(post,self.__driver,self.__layout)
- #extract time
- posted_time = Finder._Finder__find_posted_time(post,self.__layout,link_element)
-
- video = Finder._Finder__find_video_url(post,self.page_name,status)
+ # converting all reactions to numbers
+ # e.g. reactions may contain counts like "5k", "5m", so convert them to actual numbers
+ likes = Scraping_utilities._Scraping_utilities__value_to_float(
+ likes)
+ loves = Scraping_utilities._Scraping_utilities__value_to_float(
+ loves)
+ wow = Scraping_utilities._Scraping_utilities__value_to_float(
+ wow)
+ cares = Scraping_utilities._Scraping_utilities__value_to_float(
+ cares)
+ sad = Scraping_utilities._Scraping_utilities__value_to_float(
+ sad)
+ angry = Scraping_utilities._Scraping_utilities__value_to_float(
+ angry)
+ haha = Scraping_utilities._Scraping_utilities__value_to_float(
+ haha)
+
+ reactions = {"likes": int(likes), "loves": int(loves), "wow": int(wow), "cares": int(cares), "sad": int(sad),
+ "angry":
+ int(angry), "haha": int(haha)}
+
+ # count number of total reactions
+ total_reaction_count = Scraping_utilities._Scraping_utilities__count_reaction(
+ reactions)
+
+ comments = Finder._Finder__find_comments(post, self.__layout)
+ comments = int(
+ Scraping_utilities._Scraping_utilities__value_to_float(comments))
+ post_content = Finder._Finder__find_content(
+ post, self.__driver, self.__layout)
+ # extract time
+ posted_time = Finder._Finder__find_posted_time(
+ post, self.__layout, link_element)
+
+ video = Finder._Finder__find_video_url(
+ post, self.page_name, status)
image = Finder._Finder__find_image_url(post)
#post_url = "https://www.facebook.com/{}/posts/{}".format(self.page_name,status)
self.__data_dict[status] = {
- "name" : name,
- "shares" : shares,
- "reactions" : reactions,
- "reaction_count" : total_reaction_count,
- "comments" : comments,
- "content" : post_content,
- "posted_on" : posted_time,
- "video" : video,
- "image" : image,
- "post_url" :post_url
+ "name": name,
+ "shares": shares,
+ "reactions": reactions,
+ "reaction_count": total_reaction_count,
+ "comments": comments,
+ "content": post_content,
+ "posted_on": posted_time,
+ "video": video,
+ "image": image,
+ "post_url": post_url
}
except Exception as ex:
print("error at find_elements method : {}".format(ex))
-
-
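Putting the public surface of this class together, a minimal end-to-end sketch (page name and output filename are hypothetical placeholders):

from facebook_page_scraper import Facebook_scraper

scraper = Facebook_scraper("some_page_name", posts_count=25,
                           browser="chrome", timeout=600, headless=True)
# either grab everything as a JSON string keyed by post id...
posts_json = scraper.scrap_to_json()
# ...or write the same fields straight to ./posts.csv (returns True on success):
# scraper.scrap_to_csv("posts", directory=".")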
diff --git a/facebook_page_scraper/scraping_utilities.py b/facebook_page_scraper/scraping_utilities.py
index 08c4da2..f388f06 100644
--- a/facebook_page_scraper/scraping_utilities.py
+++ b/facebook_page_scraper/scraping_utilities.py
@@ -3,9 +3,11 @@
from datetime import datetime as dt
import re
from datetime import datetime, timedelta
+ from selenium.webdriver.common.by import By
except Exception as ex:
print(ex)
+
class Scraping_utilities:
@staticmethod
def __extract_numbers(string):
@@ -13,14 +15,13 @@ def __extract_numbers(string):
e.g. => input = '54454 comment', then output => 54454
"""
try:
- #return string.split(" ")[0]
- return re.findall("\d+",string)[0]
+ # return string.split(" ")[0]
+ return re.findall("\d+", string)[0]
except IndexError:
return 0
-
@staticmethod
- def __exists_in_list(li,word):
+ def __exists_in_list(li, word):
"""expects list and a element, returns all the occurence of element in the list.
e.g input => li = ['sajid','sajid','sajid','d','s'] with given word = 'sajid',
output => ['sajid','sajid','sajid'] """
@@ -37,7 +38,7 @@ def __convert_time(unix_timestamp):
def __extract_content(content):
"""returns the text content of selenium element, else if content is string than returns a empty string"""
if type(content) is not str:
- all_para = content.find_elements_by_tag_name("p")
+ all_para = content.find_elements(By.TAG_NAME, "p")
paragraph = ''
for para in all_para:
paragraph += para.get_attribute("textContent")
@@ -59,13 +60,13 @@ def __extract_id_from_link(link):
"""expects the post's URL as a argument, and extracts out post_id from that URL"""
try:
status = "NA"
- #if url pattern container "/posts"
+ # if url pattern contains "/posts"
if "posts/" in link:
status = link.split('/')[5].split('?')[0]
- #if url pattern container "/photos"
+ # if url pattern contains "/photos"
elif "photos/" in link:
status = link.split("/")[-2]
- #if url pattern container "/videos"
+ # if url pattern contains "/videos"
if "/videos/" in link:
status = link.split("/")[5]
elif "fbid=" in link:
@@ -100,10 +101,10 @@ def __value_to_float(x):
@staticmethod
def __find_reaction_by_text(l, string):
- reaction = [substring for substring in l if string in substring]
- reaction = re.findall(
- "\d+", reaction[0])[0] if len(reaction) > 0 else "0"
- return reaction
+ reaction = [substring for substring in l if string in substring]
+ reaction = re.findall(
+ "\d+", reaction[0])[0] if len(reaction) > 0 else "0"
+ return reaction
@staticmethod
def __convert_to_iso(t):
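A standalone illustration of the __find_reaction_by_text logic shown above (the aria-label strings are made-up examples):

import re

labels = ["54 Like", "7 Love", "2 Wow"]  # e.g. aria-label texts gathered from a post
reaction = [substring for substring in labels if "Like" in substring]
likes = re.findall(r"\d+", reaction[0])[0] if len(reaction) > 0 else "0"
print(likes)  # -> 54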
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 2c84015..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-selenium==4.1.0
-webdriver-manager==3.2.2
-selenium-wire==4.3.1
-python-dateutil==2.8.2
\ No newline at end of file
diff --git a/setup.py b/setup.py
index fd39f6d..8e29f4d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,26 +1,27 @@
import setuptools
-with open("README.md","r") as file:
+with open("README.md", "r") as file:
long_description = file.read()
-requirements = []
+requirements = ['selenium==4.1.0',
+ 'webdriver-manager==3.2.2',
+ 'selenium-wire==4.3.1',
+ 'python-dateutil==2.8.2']
-for line in open("requirements.txt",'r',encoding="utf-8").readlines():
- requirements.append(line.replace("\n",""))
setuptools.setup(
- name = "facebook_page_scraper",
- version = "2.0.2",
- author = "Sajid Shaikh",
- author_email = "shaikhsajid3732@gmail.com",
- description = "Python package to scrap facebook's pages front end with no limitations",
- long_description = long_description,
- long_description_content_type = "text/markdown",
+ name="facebook_page_scraper",
+ version="3.0.0",
+ author="Sajid Shaikh",
+ author_email="shaikhsajid3732@gmail.com",
+ description="Python package to scrap facebook's pages front end with no limitations",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
license="MIT",
- url = "https://github.com/shaikhsajid1111/facebook_page_scraper",
- keywords = "web-scraping selenium facebook facebook-pages",
- packages = setuptools.find_packages(),
- classifiers = [
+ url="https://github.com/shaikhsajid1111/facebook_page_scraper",
+ keywords="web-scraping selenium facebook facebook-pages",
+ packages=setuptools.find_packages(),
+ classifiers=[
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
@@ -40,6 +41,6 @@
"Topic :: Internet :: WWW/HTTP"
],
- python_requires = ">=3.6",
+ python_requires=">=3.6",
install_requires=requirements
)
|