diff --git a/README.md b/README.md
index c3d2200..57eac16 100644
--- a/README.md
+++ b/README.md
@@ -49,11 +49,12 @@
 posts_count = 10
 browser = "firefox"
 proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
 timeout = 600 #600 seconds
-meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy,timeout=timeout)
+headless = True
+meta_ai = Facebook_scraper(page_name, posts_count, browser, proxy=proxy, timeout=timeout, headless=headless)
 ```

-Parameters for Facebook_scraper(page_name,posts_count,browser,proxy,timeout) class
+Parameters for Facebook_scraper(page_name, posts_count, browser, proxy, timeout, headless) class

@@ -68,10 +69,10 @@ meta_ai = Facebook_scraper(page_name,posts_count,browser,proxy=proxy,timeout=tim page_name @@ -80,10 +81,10 @@ name of the facebook page posts_count @@ -92,10 +93,10 @@ number of posts to scrap, if not passed default is 10 browser @@ -104,10 +105,10 @@ which browser to use, either chrome or firefox. if not passed,default is chrome proxy(optional) @@ -115,13 +116,25 @@ optional argument, if user wants to set proxy, if proxy requires authentication timeout + + + + +
-string +String -name of the facebook page +Name of the facebook page
-integer +Integer -number of posts to scrap, if not passed default is 10 +Number of posts to scrape; if not passed, the default is 10
-string +String -which browser to use, either chrome or firefox. if not passed,default is chrome +Which browser to use, either chrome or firefox. If not passed, the default is chrome
-string +String -optional argument, if user wants to set proxy, if proxy requires authentication then the format will be user:password@IP:PORT +Optional argument to set a proxy; if the proxy requires authentication, the format will be user:password@IP:PORT
-integer +Integer The maximum amount of time the bot should run for. If not passed, the default timeout is set to 10 minutes
+headless + +Boolean + +Whether to run the browser in headless mode. Default is True + +
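With the table above in mind, here is a minimal sketch of the new constructor call; the page name and counts below are placeholders for illustration and not part of this patch:

```python
from facebook_page_scraper import Facebook_scraper

page_name = "metaai"   # hypothetical page name used for illustration
posts_count = 15
browser = "firefox"
timeout = 600          # stop after 10 minutes even if posts_count was not reached
headless = False       # new in 3.0.0: False runs the browser in headful (visible) mode

meta_ai = Facebook_scraper(page_name, posts_count, browser,
                           timeout=timeout, headless=headless)
json_data = meta_ai.scrap_to_json()  # returns a JSON string keyed by post id
```

The proxy argument is omitted here and keeps its default of None.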

@@ -212,7 +225,7 @@ Output Structure for JSON format: filename = "data_file" #file name without CSV extension,where data will be saved directory = "E:\data" #directory where CSV file will be saved -meta_ai.scrap_to_csv(filename,directory) +meta_ai.scrap_to_csv(filename, directory) ``` @@ -228,7 +241,7 @@ id,name,shares,likes,loves,wow,cares,sad,angry,haha,reactions_count,comments,con

-Parameters for scrap_to_csv(filename,directory) method.
+Parameters for scrap_to_csv(filename, directory) method.

@@ -258,11 +271,11 @@ name of the CSV file where post's data will be saved directory @@ -305,7 +318,7 @@ Description id @@ -343,10 +356,10 @@ share count of post reactions @@ -355,10 +368,10 @@ dictionary containing reactions as keys and its count as value. Keys => [ reaction_count @@ -368,10 +381,10 @@ total reaction count of post comments @@ -380,10 +393,10 @@ comments count of post content @@ -392,7 +405,7 @@ content of post as text video @@ -417,10 +430,10 @@ python's list containing URLs of all images present in the post posted_on @@ -429,7 +442,7 @@ time at which post was posted(in ISO 8601 format) post_url
@@ -244,11 +257,11 @@ id,name,shares,likes,loves,wow,cares,sad,angry,haha,reactions_count,comments,con filename -string +String -name of the CSV file where post's data will be saved +Name of the CSV file where post's data will be saved
-string +String -directory where CSV file have to be stored. +Directory where the CSV file has to be stored.
-string +String Post Identifier(integer casted inside string) @@ -319,7 +332,7 @@ Post Identifier(integer casted inside string) name -string +String Name of the page @@ -331,10 +344,10 @@ Name of the page shares -integer +Integer -share count of post +Share count of post
-dictionary +Dictionary -dictionary containing reactions as keys and its count as value. Keys => ["likes","loves","wow","cares","sad","angry","haha"] +Dictionary containing reactions as keys and their counts as values. Keys => ["likes","loves","wow","cares","sad","angry","haha"]
-integer +Integer -total reaction count of post +Total reaction count of post
-integer +Integer -comments count of post +Comments count of post
- string + String -content of post as text +Content of post as text
- string + String URL of video present in that post @@ -405,10 +418,10 @@ URL of video present in that post image - list + List -python's list containing URLs of all images present in the post +List containing URLs of all images present in the post
-datetime +Datetime -time at which post was posted(in ISO 8601 format) +Time at which the post was posted (in ISO 8601 format)
-string +String URL for that post @@ -449,9 +462,10 @@ URL for that post
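A short sketch of how the fields in the table above can be read back from the JSON output; the page name below is a placeholder, and only a few of the documented fields are printed:

```python
import json

from facebook_page_scraper import Facebook_scraper

scraper = Facebook_scraper("metaai", posts_count=5, browser="chrome")  # hypothetical page
posts = json.loads(scraper.scrap_to_json())  # dict keyed by post id

for post_id, post in posts.items():
    # each post carries the fields described in the table above
    print(post_id, post["name"], post["shares"], post["reaction_count"])
    print(post["reactions"]["likes"], post["reactions"]["haha"])
    print(post["comments"], post["posted_on"], post["post_url"])
```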

Tech

This project uses different libraries to work properly.


diff --git a/changelog.MD b/changelog.MD index 405eead..20794b3 100644 --- a/changelog.MD +++ b/changelog.MD @@ -1,88 +1,89 @@

 Changelog
+
+3.0.0
+
+Added:
+- Feature to control, whether to run the browser in headless or headful mode.
+
+Fixed:
+- Update the outdated selector to the latest Facebook layout.
 
 2.0.2
 
 Fixed:
 - Fix the README.md file not found error when installing PyPI
 
 2.0.0
 
-Added
+Added:
 - Timeout argument to set the maximum amount of time the bot should run in case if no post were found.
 
-Changes
+Changes:
 - Updated selenium from version 3.141.0 to 4.1.0
 
-Fixed
+Fixed:
 - Fixed issue of browser keep on scrolling above despite calling scroll down method, happening due to different multiple functions call
 
 0.1.10
 
-Added
+Added:
 - Support for new Facebook Layout
 
 0.1.9
 
-Added
+Added:
 - Added feature for using proxy while scraping
 
 0.1.8
 
-Fixed
+Fixed:
 - Fixed error "name element not found" occuring to some specific users.
 - Fixed CSV file missing posted_on column
 
 0.1.7
 
-Fixed
+Fixed:
 - Only scrolling once while scraping, which created problem that only top few posts were scrapped. If user wants more than 27 posts, it use to get strucked.
 
 0.1.6
 
 Added
 - Added feature to read long paragraph that lies inside "Continue reading" button. For e.g posts like this
 
 Fixed
 - Scrolling down directly to the bottom of the page during on start leads to login page, now it scroll down to half of the page and closes the login popup
 
 0.1.5
 
-Fixed
+Fixed:
 - Fixed feature to scrap id from different URL pattern
 
 0.1.4
 
 Added
 - Support for emoji and other different languages(other than english) in output. It is no longer shown as a Unicode string.
 
-Fixed
+Fixed:
 - Total reactions_count getting 0 as a output.
  • diff --git a/facebook_page_scraper/__init__.py b/facebook_page_scraper/__init__.py index 6f36dab..6b5d8bf 100644 --- a/facebook_page_scraper/__init__.py +++ b/facebook_page_scraper/__init__.py @@ -4,4 +4,5 @@ from .element_finder import Finder from .scraping_utilities import Scraping_utilities -__all__ = ["Initializer","Facebook_scraper","Utilities","Finder","Scraping_utilities"] \ No newline at end of file +__all__ = ["Initializer", "Facebook_scraper", + "Utilities", "Finder", "Scraping_utilities"] diff --git a/facebook_page_scraper/driver_initialization.py b/facebook_page_scraper/driver_initialization.py index 59d26e3..41eae71 100644 --- a/facebook_page_scraper/driver_initialization.py +++ b/facebook_page_scraper/driver_initialization.py @@ -13,14 +13,16 @@ class Initializer: - def __init__(self, browser_name, proxy=None): + def __init__(self, browser_name, proxy=None, headless=True): self.browser_name = browser_name self.proxy = proxy + self.headless = headless def set_properties(self, browser_option): """adds capabilities to the driver""" - browser_option.add_argument( - '--headless') # runs browser in headless mode + if self.headless: + browser_option.add_argument( + '--headless') # runs browser in headless mode browser_option.add_argument('--no-sandbox') browser_option.add_argument("--disable-dev-shm-usage") browser_option.add_argument('--ignore-certificate-errors') diff --git a/facebook_page_scraper/driver_utilities.py b/facebook_page_scraper/driver_utilities.py index 1091288..8465b46 100644 --- a/facebook_page_scraper/driver_utilities.py +++ b/facebook_page_scraper/driver_utilities.py @@ -1,18 +1,17 @@ #!/usr/bin/env python3 -from fileinput import close - try: from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - from selenium.common.exceptions import NoSuchElementException,WebDriverException + from selenium.common.exceptions import NoSuchElementException, WebDriverException from random import randint from selenium.webdriver.common.keys import Keys import sys except Exception as ex: print(ex) + class Utilities: @staticmethod @@ -30,58 +29,61 @@ def __close_error_popup(driver): like "We could not process your request. 
Please try again later" , than click on close button to skip that popup.''' try: - WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.layerCancel'))) #wait for popup to show - button = driver.find_element(By.CSS_SELECTOR,"a.layerCancel") #grab that popup's close button - button.click() #click "close" button + WebDriverWait(driver, 10).until(EC.element_to_be_clickable( + (By.CSS_SELECTOR, 'a.layerCancel'))) # wait for popup to show + # grab that popup's close button + button = driver.find_element(By.CSS_SELECTOR, "a.layerCancel") + button.click() # click "close" button except WebDriverException: - #it is possible that even after waiting for given amount of time,modal may not appear + # it is possible that even after waiting for given amount of time,modal may not appear pass except NoSuchElementException: - pass #passing this error silently because it may happen that popup never shows up + pass # passing this error silently because it may happen that popup never shows up except Exception as ex: - #if any other error occured except the above one + # if any other error occured except the above one print("error at close_error_popup method : {}".format(ex)) @staticmethod def __scroll_down_half(driver): try: - driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 2);") + driver.execute_script( + "window.scrollTo(0, document.body.scrollHeight / 2);") except Exception as ex: - #if any error occured than close the driver and exit + # if any error occured than close the driver and exit Utilities.__close_driver(driver) print("error at scroll_down_half method : {}".format(ex)) @staticmethod def __close_modern_layout_signup_modal(driver): - try: - driver.execute_script( - "window.scrollTo(0, document.body.scrollHeight);") - close_button = driver.find_element(By.CSS_SELECTOR,'[aria-label="Close"]') - close_button.click() - except NoSuchElementException: - pass - except Exception as ex: - print("error at close_modern_layout_signup_modal: {}".format(ex)) - + try: + driver.execute_script( + "window.scrollTo(0, document.body.scrollHeight);") + close_button = driver.find_element( + By.CSS_SELECTOR, '[aria-label="Close"]') + close_button.click() + except NoSuchElementException: + pass + except Exception as ex: + print("error at close_modern_layout_signup_modal: {}".format(ex)) @staticmethod - def __scroll_down(driver,layout): + def __scroll_down(driver, layout): """expects driver's instance as a argument, and it scrolls down page to the most bottom till the height""" try: - if layout == "old": - driver.execute_script( - "window.scrollTo(0, document.body.scrollHeight);") - elif layout == "new": - body = driver.find_element(By.CSS_SELECTOR,"body") - for _ in range(randint(5,6)): - body.send_keys(Keys.PAGE_UP) - for _ in range(randint(5, 8)): - body.send_keys(Keys.PAGE_DOWN) - #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - #Utilities.__close_modern_layout_signup_modal(driver) + if layout == "old": + driver.execute_script( + "window.scrollTo(0, document.body.scrollHeight);") + elif layout == "new": + body = driver.find_element(By.CSS_SELECTOR, "body") + for _ in range(randint(5, 6)): + body.send_keys(Keys.PAGE_UP) + for _ in range(randint(5, 8)): + body.send_keys(Keys.PAGE_DOWN) + #driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + # Utilities.__close_modern_layout_signup_modal(driver) except Exception as ex: - #if any error occured than close the driver and exit + # if any error occured than close the driver and exit 
Utilities.__close_driver(driver) print("error at scroll_down method : {}".format(ex)) @@ -89,61 +91,65 @@ def __scroll_down(driver,layout): def __close_popup(driver): """expects driver's instance and closes modal that ask for login, by clicking "Not Now" button """ try: - #Utilities.__scroll_down_half(driver) #try to scroll - #wait for popup to show - WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,'expanding_cta_close_button'))) - #grab "Not Now" button - popup_close_button = driver.find_element(By.ID,'expanding_cta_close_button') - popup_close_button.click() #click the button + # Utilities.__scroll_down_half(driver) #try to scroll + # wait for popup to show + WebDriverWait(driver, 10).until(EC.element_to_be_clickable( + (By.ID, 'expanding_cta_close_button'))) + # grab "Not Now" button + popup_close_button = driver.find_element( + By.ID, 'expanding_cta_close_button') + popup_close_button.click() # click the button except WebDriverException: - #modal may not popup, so no need to raise exception in case it is not found + # modal may not popup, so no need to raise exception in case it is not found pass except NoSuchElementException: - pass #passing this exception silently as modal may not show up + pass # passing this exception silently as modal may not show up except Exception as ex: print("error at close_popup method : {}".format(ex)) @staticmethod - def __wait_for_element_to_appear(driver,layout): + def __wait_for_element_to_appear(driver, layout): """expects driver's instance, wait for posts to show. post's CSS class name is userContentWrapper """ try: if layout == "old": - #wait for page to load so posts are visible - body = driver.find_element(By.CSS_SELECTOR,"body") - for _ in range(randint(3, 5)): - body.send_keys(Keys.PAGE_DOWN) - WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR,'.userContentWrapper'))) + # wait for page to load so posts are visible + body = driver.find_element(By.CSS_SELECTOR, "body") + for _ in range(randint(3, 5)): + body.send_keys(Keys.PAGE_DOWN) + WebDriverWait(driver, 30).until(EC.presence_of_element_located( + (By.CSS_SELECTOR, '.userContentWrapper'))) elif layout == "new": - WebDriverWait(driver, 30).until( - EC.presence_of_element_located((By.CSS_SELECTOR, "[aria-posinset]"))) + WebDriverWait(driver, 30).until( + EC.presence_of_element_located((By.CSS_SELECTOR, "[aria-posinset]"))) except WebDriverException: - #if it was not found,it means either page is not loading or it does not exists + # if it was not found,it means either page is not loading or it does not exists print("No posts were found!") Utilities.__close_driver(driver) - sys.exit(1) #exit the program, because if posts does not exists,we cannot go further + # exit the program, because if posts does not exists,we cannot go further + sys.exit(1) except Exception as ex: print("error at wait_for_element_to_appear method : {}".format(ex)) Utilities.__close_driver(driver) - - @staticmethod - def __click_see_more(driver,content): + def __click_see_more(driver, content): """expects driver's instance and selenium element, click on "see more" link to open hidden content""" try: - #find element and click 'see more' button - element = content.find_element(By.CSS_SELECTOR,'span.see_more_link_inner') - driver.execute_script("arguments[0].click();", element) #click button using js + # find element and click 'see more' button + element = content.find_element( + By.CSS_SELECTOR, 'span.see_more_link_inner') + # click button using js + 
driver.execute_script("arguments[0].click();", element) except NoSuchElementException: - #if it doesn't exists than no need to raise any error + # if it doesn't exists than no need to raise any error pass except AttributeError: pass except IndexError: pass except Exception as ex: - print("error at click_see_more method : {}".format(ex)) \ No newline at end of file + print("error at click_see_more method : {}".format(ex)) diff --git a/facebook_page_scraper/element_finder.py b/facebook_page_scraper/element_finder.py index 746c256..2467394 100644 --- a/facebook_page_scraper/element_finder.py +++ b/facebook_page_scraper/element_finder.py @@ -13,106 +13,111 @@ except Exception as ex: print(ex) + class Finder(): """ Holds the collections of methods that finds element of the facebook's posts using selenium's webdriver's methods """ @staticmethod def __get_status_link(link_list): - status = "" - for link in link_list: - link_value = link.get_attribute("href") - if "/posts/" in link_value and "/groups/" in link_value: - status = link - break - if "/posts/" in link_value: - status = link - break - if "/videos/pcb" in link_value: - status = link - break - elif "/photos/" in link_value: - # print(link_value) - status = link - break - if "fbid=" in link_value: - status = link - break - elif "/group/" in link_value: - # print(link_value) - status = link - break - if "/videos/" in link_value: - # print(link_value) - status = link - break - elif "/groups/" in link_value: - # print(link_value) - status = link - break - return status + status = "" + for link in link_list: + link_value = link.get_attribute("href") + if "/posts/" in link_value and "/groups/" in link_value: + status = link + break + if "/posts/" in link_value: + status = link + break + if "/videos/pcb" in link_value: + status = link + break + elif "/photos/" in link_value: + # print(link_value) + status = link + break + if "fbid=" in link_value: + status = link + break + elif "/group/" in link_value: + # print(link_value) + status = link + break + if "/videos/" in link_value: + # print(link_value) + status = link + break + elif "/groups/" in link_value: + # print(link_value) + status = link + break + return status @staticmethod - def __find_status(post,layout): + def __find_status(post, layout): """finds URL of the post, then extracts link from that URL and returns it""" try: link = None if layout == "old": - #aim is to find element that looks like - #after finding that element, get it's href value and pass it to different method that extracts post_id from that href - status_link = post.find_element(By.CLASS_NAME,"_5pcq").get_attribute("href") - #extract out post id from post's url - status = Scraping_utilities._Scraping_utilities__extract_id_from_link(status_link) + # aim is to find element that looks like + # after finding that element, get it's href value and pass it to different method that extracts post_id from that href + status_link = post.find_element( + By.CLASS_NAME, "_5pcq").get_attribute("href") + # extract out post id from post's url + status = Scraping_utilities._Scraping_utilities__extract_id_from_link( + status_link) elif layout == "new": - #links = post.find_elements(By.CSS_SELECTOR,"a[role='link']") - link = post.find_element(By.CSS_SELECTOR,'.gpro0wi8.b1v8xokw') - status_link = link.get_attribute('href') - status = Scraping_utilities._Scraping_utilities__extract_id_from_link( - status_link) + #links = post.find_elements(By.CSS_SELECTOR,"a[role='link']") + link = post.find_element(By.CSS_SELECTOR, 'a[aria-label]') + 
status_link = link.get_attribute('href') + status = Scraping_utilities._Scraping_utilities__extract_id_from_link( + status_link) except NoSuchElementException: - #if element is not found + # if element is not found status = "NA" except Exception as ex: print("error at find_status method : {}".format(ex)) status = "NA" - return (status,status_link,link) + return (status, status_link, link) @staticmethod - def __find_share(post,layout): + def __find_share(post, layout): """finds shares count of the facebook post using selenium's webdriver's method""" try: if layout == "old": - #aim is to find element that have datatest-id attribute as UFI2SharesCount/root - shares = post.find_element(By.CSS_SELECTOR,"[data-testid='UFI2SharesCount/root']").get_attribute('textContent') - shares = Scraping_utilities._Scraping_utilities__extract_numbers(shares) + # aim is to find element that have datatest-id attribute as UFI2SharesCount/root + shares = post.find_element( + By.CSS_SELECTOR, "[data-testid='UFI2SharesCount/root']").get_attribute('textContent') + shares = Scraping_utilities._Scraping_utilities__extract_numbers( + shares) elif layout == "new": - elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn") - shares = "0" - for element in elements: - text = element.text - if "share" in text: - shares = re.findall("\d+", text)[0] - break + elements = post.find_elements(By.CSS_SELECTOR, "div.gtad4xkn") + shares = "0" + for element in elements: + text = element.text + if "share" in text: + shares = re.findall("\d+", text)[0] + break return shares except NoSuchElementException: - #if element is not present that means there wasn't any shares + # if element is not present that means there wasn't any shares shares = 0 except Exception as ex: print("error at find_share method : {}".format(ex)) shares = 0 - return shares @staticmethod def __find_reactions(post): """finds all reaction of the facebook post using selenium's webdriver's method""" try: - #find element that have attribute aria-label as 'See who reacted to this - reactions_all = post.find_element(By.CSS_SELECTOR,'[aria-label="See who reacted to this"]') + # find element that have attribute aria-label as 'See who reacted to this + reactions_all = post.find_element( + By.CSS_SELECTOR, '[aria-label="See who reacted to this"]') except NoSuchElementException: reactions_all = "" except Exception as ex: @@ -120,22 +125,24 @@ def __find_reactions(post): return reactions_all @staticmethod - def __find_comments(post,layout): + def __find_comments(post, layout): """finds comments count of the facebook post using selenium's webdriver's method""" try: comments = "" if layout == "old": - comments = post.find_element(By.CSS_SELECTOR,"a._3hg-").get_attribute('textContent') - #extract numbers from text - comments = Scraping_utilities._Scraping_utilities__extract_numbers(comments) + comments = post.find_element( + By.CSS_SELECTOR, "a._3hg-").get_attribute('textContent') + # extract numbers from text + comments = Scraping_utilities._Scraping_utilities__extract_numbers( + comments) elif layout == "new": - elements = post.find_elements(By.CSS_SELECTOR,"div.gtad4xkn") - comments = "0" - for element in elements: - text = element.text - if "comment" in text: - comments = Scraping_utilities._Scraping_utilities__extract_numbers( - comments) + elements = post.find_elements(By.CSS_SELECTOR, "div.gtad4xkn") + comments = "0" + for element in elements: + text = element.text + if "comment" in text: + comments = Scraping_utilities._Scraping_utilities__extract_numbers( + comments) 
except NoSuchElementException: comments = 0 except Exception as ex: @@ -153,68 +160,74 @@ def __fetch_post_passage(href): post_message_div_finder_regex = '
    (.*?)<\/div>' - post_message = re.search(post_message_div_finder_regex,text) + post_message = re.search(post_message_div_finder_regex, text) replace_html_tags_regex = '<[^<>]+>' - message = re.sub(replace_html_tags_regex,'',post_message.group(0)) + message = re.sub(replace_html_tags_regex, '', post_message.group(0)) return message @staticmethod - def __element_exists(element,css_selector): + def __element_exists(element, css_selector): try: - found = element.find_element(By.CSS_SELECTOR,css_selector) + found = element.find_element(By.CSS_SELECTOR, css_selector) return True except NoSuchElementException: return False @staticmethod - def __find_content(post,driver,layout): + def __find_content(post, driver, layout): """finds content of the facebook post using selenium's webdriver's method and returns string containing text of the posts""" try: if layout == "old": - post_content = post.find_element(By.CLASS_NAME,'userContent') + post_content = post.find_element(By.CLASS_NAME, 'userContent') elif layout == "new": - post_content = post.find_element(By.CSS_SELECTOR,'[data-ad-preview="message"]') - #if 'See more' or 'Continue reading' is present in post - if Finder._Finder__element_exists(post_content,"span.text_exposed_link > a"): - element = post_content.find_element(By.CSS_SELECTOR,"span.text_exposed_link > a") #grab that element - #if element have already the onclick function, that means it is expandable paragraph + post_content = post.find_element( + By.CSS_SELECTOR, '[data-ad-preview="message"]') + # if 'See more' or 'Continue reading' is present in post + if Finder._Finder__element_exists(post_content, "span.text_exposed_link > a"): + element = post_content.find_element( + By.CSS_SELECTOR, "span.text_exposed_link > a") # grab that element + # if element have already the onclick function, that means it is expandable paragraph if element.get_attribute("onclick"): - Utilities._Utilities__click_see_more(driver,post_content) #click 'see more' button to get hidden text as well - content = Scraping_utilities._Scraping_utilities__extract_content(post_content) #extract content out of it - elif element.get_attribute("target"): #if element have attribute of target="_blank" - #if it does not have onclick() method, it means we'll to extract passage by request - #if content have attribute target="_blank" it indicates that text will open in new tab, - #so make a seperate request and get that text - content = Finder._Finder__fetch_post_passage(element.get_attribute("href")) + # click 'see more' button to get hidden text as well + Utilities._Utilities__click_see_more(driver, post_content) + content = Scraping_utilities._Scraping_utilities__extract_content( + post_content) # extract content out of it + # if element have attribute of target="_blank" + elif element.get_attribute("target"): + # if it does not have onclick() method, it means we'll to extract passage by request + # if content have attribute target="_blank" it indicates that text will open in new tab, + # so make a seperate request and get that text + content = Finder._Finder__fetch_post_passage( + element.get_attribute("href")) else: - #if it does not have see more, just get the text out of it + # if it does not have see more, just get the text out of it content = post_content.get_attribute("textContent") except NoSuchElementException: - #if [data-testid="post_message"] is not found, it means that post did not had any text,either it is image or video + # if [data-testid="post_message"] is not found, it means that post did not had any 
text,either it is image or video content = "" except Exception as ex: print("error at find_content method : {}".format(ex)) content = "" return content - @staticmethod - def __find_posted_time(post,layout,link_element): + def __find_posted_time(post, layout, link_element): """finds posted time of the facebook post using selenium's webdriver's method""" try: - #extract element that looks like + # extract element that looks like #posted_time = post.find_element_by_css_selector("abbr._5ptz").get_attribute("data-utime") if layout == "old": - posted_time = post.find_element(By.TAG_NAME,"abbr").get_attribute('data-utime') - return datetime.datetime.fromtimestamp(float(posted_time)).isoformat() + posted_time = post.find_element( + By.TAG_NAME, "abbr").get_attribute('data-utime') + return datetime.datetime.fromtimestamp(float(posted_time)).isoformat() elif layout == "new": - aria_label_value = link_element.get_attribute("aria-label") - timestamp = parse(aria_label_value).isoformat() if len( - aria_label_value) > 5 else Scraping_utilities._Scraping_utilities__convert_to_iso(aria_label_value) - return timestamp + aria_label_value = link_element.get_attribute("aria-label") + timestamp = parse(aria_label_value).isoformat() if len( + aria_label_value) > 5 else Scraping_utilities._Scraping_utilities__convert_to_iso(aria_label_value) + return timestamp except dateutil.parser._parser.ParserError: timestamp = Scraping_utilities._Scraping_utilities__convert_to_iso( aria_label_value) @@ -226,14 +239,14 @@ def __find_posted_time(post,layout,link_element): timestamp = "" return timestamp - @staticmethod - def __find_video_url(post,page_name,status): + def __find_video_url(post, page_name, status): """finds video of the facebook post using selenium's webdriver's method""" try: - #if video is found in the post, than create a video URL by concatenating post's id with page_name - video_element = post.find_element(By.TAG_NAME,"video") - video = "https://www.facebook.com/{}/videos/{}".format(page_name,status) + # if video is found in the post, than create a video URL by concatenating post's id with page_name + video_element = post.find_element(By.TAG_NAME, "video") + video = "https://www.facebook.com/{}/videos/{}".format( + page_name, status) except NoSuchElementException: video = "" @@ -248,10 +261,12 @@ def __find_video_url(post,page_name,status): def __find_image_url(post): """finds all image of the facebook post using selenium's webdriver's method""" try: - #find all img tag that looks like - images = post.find_elements(By.CSS_SELECTOR,"img.scaledImageFitWidth.img") - #extract src attribute from all the img tag,store it in list - sources = [image.get_attribute("src") for image in images] if len(images) > 0 else [] + # find all img tag that looks like + images = post.find_elements( + By.CSS_SELECTOR, "img.scaledImageFitWidth.img") + # extract src attribute from all the img tag,store it in list + sources = [image.get_attribute("src") for image in images] if len( + images) > 0 else [] except NoSuchElementException: sources = [] pass @@ -262,18 +277,20 @@ def __find_image_url(post): return sources @staticmethod - def __find_all_posts(driver,layout): + def __find_all_posts(driver, layout): """finds all posts of the facebook page using selenium's webdriver's method""" try: - #find all posts that looks like
    + # find all posts that looks like
    if layout == "old": - all_posts = driver.find_elements(By.CSS_SELECTOR,"div.userContentWrapper") + all_posts = driver.find_elements( + By.CSS_SELECTOR, "div.userContentWrapper") elif layout == "new": - all_posts = driver.find_elements(By.CSS_SELECTOR,'[aria-posinset]') + all_posts = driver.find_elements( + By.CSS_SELECTOR, '[aria-posinset]') return all_posts except NoSuchElementException: print("Cannot find any posts! Exiting!") - #if this fails to find posts that means, code cannot move forward, as no post is found + # if this fails to find posts that means, code cannot move forward, as no post is found Utilities.__close_driver(driver) sys.exit(1) except Exception as ex: @@ -282,40 +299,41 @@ def __find_all_posts(driver,layout): sys.exit(1) @staticmethod - def __find_name(driver,layout): + def __find_name(driver, layout): """finds name of the facebook page using selenium's webdriver's method""" try: if layout == "old": - name = driver.find_element(By.CSS_SELECTOR,'a._64-f').get_attribute('textContent') + name = driver.find_element( + By.CSS_SELECTOR, 'a._64-f').get_attribute('textContent') elif layout == "new": - name = driver.find_element(By.TAG_NAME,"strong").get_attribute("textContent") + name = driver.find_element( + By.TAG_NAME, "strong").get_attribute("textContent") return name except Exception as ex: print("error at __find_name method : {}".format(ex)) @staticmethod def __detect_ui(driver): - try: - driver.find_element(By.ID,"pagelet_bluebar") - return "old" - except NoSuchElementException: - return "new" - except Exception as ex: - print("error art __detect_ui: {}".format(ex)) - Utilities.__close_driver(driver) - sys.exit(1) + try: + driver.find_element(By.ID, "pagelet_bluebar") + return "old" + except NoSuchElementException: + return "new" + except Exception as ex: + print("error art __detect_ui: {}".format(ex)) + Utilities.__close_driver(driver) + sys.exit(1) @staticmethod def __find_reaction(layout, reactions_all): - try: - if layout == "old": - return reactions_all.find_elements(By.TAG_NAME, - "a") - elif layout == "new": - return reactions_all.find_elements(By.TAG_NAME, - "div") - - except Exception as ex: - print("find_reaction", ex) - return "" + try: + if layout == "old": + return reactions_all.find_elements(By.TAG_NAME, + "a") + elif layout == "new": + return reactions_all.find_elements(By.TAG_NAME, + "div") + except Exception as ex: + print("find_reaction", ex) + return "" diff --git a/facebook_page_scraper/scraper.py b/facebook_page_scraper/scraper.py index a06bc18..c90d5af 100644 --- a/facebook_page_scraper/scraper.py +++ b/facebook_page_scraper/scraper.py @@ -4,7 +4,6 @@ from .driver_utilities import Utilities from .element_finder import Finder from .scraping_utilities import Scraping_utilities - from selenium.common.exceptions import NoSuchElementException import json import csv import os @@ -13,34 +12,33 @@ except Exception as ex: print(ex) + class Facebook_scraper: - __data_dict = {} #this dictionary stores all post's data + __data_dict = {} # this dictionary stores all post's data - #when we scroll and extract all posts,it may happens that we extract same posts over and over,so this lead to too much iteration - #and waste time to iterate over and over the same post, to solve that, + # when we scroll and extract all posts,it may happens that we extract same posts over and over,so this lead to too much iteration + # and waste time to iterate over and over the same post, to solve that, # problem I needed a data structure which # 1) removes duplicates from itself 
automatically, # 2) provides search of element, # 3) compatible with list's unpacking to quickly add element inside itself from list # set() seems to be doing the work properly - #__extracted_post contains all the post's ID that have been scraped before and as it set() it avoids post's ID duplication. + # __extracted_post contains all the post's ID that have been scraped before and as it set() it avoids post's ID duplication. __extracted_post = set() - - #condition, - #1) if we reach bottom of the page and post is not longer available, and we don't meet the number of posts that we need to find - #2) if we were given wrong page_name, and it does not exists in fb than no post will exist. - #with above condition being true, the crawler will keep on scrolling the page to find posts + # condition, + # 1) if we reach bottom of the page and post is not longer available, and we don't meet the number of posts that we need to find + # 2) if we were given wrong page_name, and it does not exists in fb than no post will exist. + # with above condition being true, the crawler will keep on scrolling the page to find posts # and it will stuck in infinite loop, which may cause machine to crash - #to solve the problem, I have declared a class member "retry",assigned it value 10. - #it checks 'retry' times if posts does not exists. + # to solve the problem, I have declared a class member "retry",assigned it value 10. + # it checks 'retry' times if posts does not exists. # __no_post_found method subtracts -1 every time the if post is not found. - #on each iteration __close_after_retry is called to check if retry have turned to 0 + # on each iteration __close_after_retry is called to check if retry have turned to 0 # if it returns true,it will break the loop. After coming out of loop,driver will be closed and it will return post whatever was found - - def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=600): + def __init__(self, page_name, posts_count=10, browser="chrome", proxy=None, timeout=600, headless=True): self.page_name = page_name self.posts_count = int(posts_count) #self.URL = "https://en-gb.facebook.com/pg/{}/posts".format(self.page_name) @@ -50,214 +48,242 @@ def __init__(self,page_name,posts_count=10,browser="chrome",proxy=None,timeout=6 self.proxy = proxy self.__layout = '' self.timeout = timeout + self.headless = headless def __start_driver(self): """changes the class member __driver value to driver on call""" - self.__driver = Initializer(self.browser,self.proxy).init() - def __handle_popup(self,layout): - #while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button + self.__driver = Initializer( + self.browser, self.proxy, self.headless).init() + + def __handle_popup(self, layout): + # while scrolling, wait for login popup to show, it can be skipped by clicking "Not Now" button try: - if layout == "old": - #if during scrolling any of error or signup popup shows - Utilities._Utilities__close_error_popup(self.__driver) - Utilities._Utilities__close_popup(self.__driver) - elif layout == "new": - Utilities._Utilities__close_modern_layout_signup_modal(self.__driver) + if layout == "old": + # if during scrolling any of error or signup popup shows + Utilities._Utilities__close_error_popup(self.__driver) + Utilities._Utilities__close_popup(self.__driver) + elif layout == "new": + Utilities._Utilities__close_modern_layout_signup_modal( + self.__driver) except Exception as ex: - print(ex) + print(ex) - def 
__check_timeout(self,start_time,current_time): - return (current_time-start_time) > self.timeout + def __check_timeout(self, start_time, current_time): + return (current_time-start_time) > self.timeout def scrap_to_json(self): - #call the __start_driver and override class member __driver to webdriver's instance + # call the __start_driver and override class member __driver to webdriver's instance self.__start_driver() starting_time = time.time() - #navigate to URL + # navigate to URL self.__driver.get(self.URL) self.__layout = Finder._Finder__detect_ui(self.__driver) - #sometimes we get popup that says "your request couldn't be processed", however - #posts are loading in background if popup is closed, so call this method in case if it pops up. + # sometimes we get popup that says "your request couldn't be processed", however + # posts are loading in background if popup is closed, so call this method in case if it pops up. Utilities._Utilities__close_error_popup(self.__driver) - #wait for post to load - Utilities._Utilities__wait_for_element_to_appear(self.__driver,self.__layout) - #scroll down to bottom most - Utilities._Utilities__scroll_down(self.__driver,self.__layout) + # wait for post to load + Utilities._Utilities__wait_for_element_to_appear( + self.__driver, self.__layout) + # scroll down to bottom most + Utilities._Utilities__scroll_down(self.__driver, self.__layout) self.__handle_popup(self.__layout) - - name = Finder._Finder__find_name(self.__driver,self.__layout) #find name element + name = Finder._Finder__find_name( + self.__driver, self.__layout) # find name element while len(self.__data_dict) <= self.posts_count: self.__handle_popup(self.__layout) self.__find_elements(name) current_time = time.time() - if self.__check_timeout(starting_time,current_time) is True: - print("Timeout...") - break - Utilities._Utilities__scroll_down(self.__driver, self.__layout) #scroll down - #print(len(self.__data_dict)) - #close the browser window after job is done. + if self.__check_timeout(starting_time, current_time) is True: + print("Timeout...") + break + Utilities._Utilities__scroll_down( + self.__driver, self.__layout) # scroll down + # print(len(self.__data_dict)) + # close the browser window after job is done. 
Utilities._Utilities__close_driver(self.__driver) - #dict trimming, might happen that we find more posts than it was asked, so just trim it - self.__data_dict = dict(list(self.__data_dict.items())[0:int(self.posts_count)]) + # dict trimming, might happen that we find more posts than it was asked, so just trim it + self.__data_dict = dict(list(self.__data_dict.items())[ + 0:int(self.posts_count)]) - return json.dumps(self.__data_dict,ensure_ascii=False) + return json.dumps(self.__data_dict, ensure_ascii=False) - def __json_to_csv(self,filename,json_data,directory): + def __json_to_csv(self, filename, json_data, directory): - os.chdir(directory) #change working directory to given directory - #headers of the CSV file - fieldnames = ['id','name','shares','likes','loves','wow','cares','sad','angry','haha','reactions_count','comments', - 'content','posted_on','video' - ,'image','post_url'] - #open and start writing to CSV files - with open("{}.csv".format(filename),'w',newline='',encoding="utf-8") as data_file: - writer = csv.DictWriter(data_file,fieldnames=fieldnames) #instantiate DictWriter for writing CSV file + os.chdir(directory) # change working directory to given directory + # headers of the CSV file + fieldnames = ['id', 'name', 'shares', 'likes', 'loves', 'wow', 'cares', 'sad', 'angry', 'haha', 'reactions_count', 'comments', + 'content', 'posted_on', 'video', 'image', 'post_url'] + # open and start writing to CSV files + with open("{}.csv".format(filename), 'w', newline='', encoding="utf-8") as data_file: + # instantiate DictWriter for writing CSV file + writer = csv.DictWriter(data_file, fieldnames=fieldnames) - writer.writeheader() #write headers to CSV file - #iterate over entire dictionary, write each posts as a row to CSV file + writer.writeheader() # write headers to CSV file + # iterate over entire dictionary, write each posts as a row to CSV file for key in json_data: - #parse post in a dictionary and write it as a single row - row = {'id': key,'name' : json_data[key]['name'],'shares':json_data[key]['shares'], - 'likes' : json_data[key]['reactions']['likes'],'loves' : json_data[key]['reactions']['loves'], - 'wow' : json_data[key]['reactions']['wow'],'cares' : json_data[key]['reactions']['cares'], - 'sad' : json_data[key]['reactions']['sad'],'angry' : json_data[key]['reactions']['angry'], - 'haha' : json_data[key]['reactions']['haha'],'reactions_count' : json_data[key]['reaction_count'], - 'comments' : json_data[key]['comments'],'content' : json_data[key]['content'],'posted_on' : json_data[key]['posted_on'], - 'video' : json_data[key]['video'],'image': " ".join(json_data[key]['image']) - ,'post_url' : json_data[key]['post_url'] - } - writer.writerow(row) #write row to CSV file - - data_file.close() #after writing close the file - - def scrap_to_csv(self,filename,directory = os.getcwd()): + # parse post in a dictionary and write it as a single row + row = {'id': key, 'name': json_data[key]['name'], 'shares': json_data[key]['shares'], + 'likes': json_data[key]['reactions']['likes'], 'loves': json_data[key]['reactions']['loves'], + 'wow': json_data[key]['reactions']['wow'], 'cares': json_data[key]['reactions']['cares'], + 'sad': json_data[key]['reactions']['sad'], 'angry': json_data[key]['reactions']['angry'], + 'haha': json_data[key]['reactions']['haha'], 'reactions_count': json_data[key]['reaction_count'], + 'comments': json_data[key]['comments'], 'content': json_data[key]['content'], 'posted_on': json_data[key]['posted_on'], + 'video': json_data[key]['video'], 'image': " 
".join(json_data[key]['image']), 'post_url': json_data[key]['post_url'] + } + writer.writerow(row) # write row to CSV file + + data_file.close() # after writing close the file + + def scrap_to_csv(self, filename, directory=os.getcwd()): try: - data = self.scrap_to_json() #get the data in JSON format from the same class method - self.__json_to_csv(filename,json.loads(data),directory) #convert it and write to CSV + data = self.scrap_to_json() # get the data in JSON format from the same class method + # convert it and write to CSV + self.__json_to_csv(filename, json.loads(data), directory) return True except Exception as ex: print(ex) return False - def __remove_duplicates(self,all_posts): + def __remove_duplicates(self, all_posts): """takes a list of posts and removes duplicates from it and returns the list""" - if len(self.__extracted_post) == 0: #if self.__extracted_post is empty that means it is first extraction - self.__extracted_post.update(all_posts) #if it does than just add all the elements from the lists to __extracted_post set() - return all_posts #return the all_posts without any changes as it is first time and no duplicate is present + if len(self.__extracted_post) == 0: # if self.__extracted_post is empty that means it is first extraction + # if it does than just add all the elements from the lists to __extracted_post set() + self.__extracted_post.update(all_posts) + return all_posts # return the all_posts without any changes as it is first time and no duplicate is present else: - #if self.extracted posts have some element than compare it with all_posts's element and return a new list containing new element - removed_duplicated = [post for post in all_posts if post not in self.__extracted_post] - self.__extracted_post.update(all_posts) #after removing duplicates, add all those new element to extracted_posts, as it - return removed_duplicated #is set() it won't have duplicate elements + # if self.extracted posts have some element than compare it with all_posts's element and return a new list containing new element + removed_duplicated = [ + post for post in all_posts if post not in self.__extracted_post] + # after removing duplicates, add all those new element to extracted_posts, as it + self.__extracted_post.update(all_posts) + return removed_duplicated # is set() it won't have duplicate elements def __close_after_retry(self): """returns if class member retry is 0""" return self.retry <= 0 - - def __no_post_found(self,all_posts): + def __no_post_found(self, all_posts): """if all_posts were found to be length of 0""" if len(all_posts) == 0: - #if length of posts is 0,decrement retry by 1 + # if length of posts is 0,decrement retry by 1 self.retry -= 1 - def __find_elements(self,name): + def __find_elements(self, name): """find elements of posts and add them to data_dict""" - all_posts = Finder._Finder__find_all_posts(self.__driver,self.__layout) #find all posts - all_posts = self.__remove_duplicates(all_posts) #remove duplicates from the list + all_posts = Finder._Finder__find_all_posts( + self.__driver, self.__layout) # find all posts + all_posts = self.__remove_duplicates( + all_posts) # remove duplicates from the list - #iterate over all the posts and find details from the same + # iterate over all the posts and find details from the same for post in all_posts: try: - #find post ID from post - status,post_url,link_element = Finder._Finder__find_status(post,self.__layout) - #find share from the post - shares = Finder._Finder__find_share(post,self.__layout) - #converting shares 
to number - #e.g if 5k than it should be 5000 - shares = int(Scraping_utilities._Scraping_utilities__value_to_float(shares)) - #find all reactions + # find post ID from post + status, post_url, link_element = Finder._Finder__find_status( + post, self.__layout) + # find share from the post + shares = Finder._Finder__find_share(post, self.__layout) + # converting shares to number + # e.g if 5k than it should be 5000 + shares = int( + Scraping_utilities._Scraping_utilities__value_to_float(shares)) + # find all reactions reactions_all = Finder._Finder__find_reactions(post) - #find all anchor tags in reactions_all list - all_hrefs_in_react = Finder._Finder__find_reaction(self.__layout,reactions_all,) if type( + # find all anchor tags in reactions_all list + all_hrefs_in_react = Finder._Finder__find_reaction(self.__layout, reactions_all,) if type( reactions_all) != str else "" - #if hrefs were found - #all_hrefs contains elements like - #["5 comments","54 Likes"] and so on + # if hrefs were found + # all_hrefs contains elements like + # ["5 comments","54 Likes"] and so on if type(all_hrefs_in_react) == list: - l = [i.get_attribute("aria-label") for i in all_hrefs_in_react] + l = [i.get_attribute("aria-label") + for i in all_hrefs_in_react] else: l = [] - #extract that aria-label from all_hrefs_in_react list and than extract number from them seperately - #if Like aria-label is in the list, than extract it and extract numbers from that text + # extract that aria-label from all_hrefs_in_react list and than extract number from them seperately + # if Like aria-label is in the list, than extract it and extract numbers from that text - likes = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Like") + likes = Scraping_utilities._Scraping_utilities__find_reaction_by_text( + l, "Like") - #if Love aria-label is in the list, than extract it and extract numbers from that text + # if Love aria-label is in the list, than extract it and extract numbers from that text loves = Scraping_utilities._Scraping_utilities__find_reaction_by_text( l, "Love") - #if Wow aria-label is in the list, than extract it and extract numbers from that text + # if Wow aria-label is in the list, than extract it and extract numbers from that text wow = Scraping_utilities._Scraping_utilities__find_reaction_by_text( l, "Wow") - #if Care aria-label is in the list, than extract it and extract numbers from that text - cares = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Care") - #if Sad aria-label is in the list, than extract it and extract numbers from that text - sad = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Sad") - #if Angry aria-label is in the list, than extract it and extract numbers from that text - angry = Scraping_utilities._Scraping_utilities__find_reaction_by_text(l,"Angry") - #if Haha aria-label is in the list, than extract it and extract numbers from that text + # if Care aria-label is in the list, than extract it and extract numbers from that text + cares = Scraping_utilities._Scraping_utilities__find_reaction_by_text( + l, "Care") + # if Sad aria-label is in the list, than extract it and extract numbers from that text + sad = Scraping_utilities._Scraping_utilities__find_reaction_by_text( + l, "Sad") + # if Angry aria-label is in the list, than extract it and extract numbers from that text + angry = Scraping_utilities._Scraping_utilities__find_reaction_by_text( + l, "Angry") + # if Haha aria-label is in the list, than extract it and extract numbers from that text haha 
= Scraping_utilities._Scraping_utilities__find_reaction_by_text( l, "Haha") - #converting all reactions to numbers - #e,g reactions may contain counts like "5k","5m", so converting them to actual number - likes = Scraping_utilities._Scraping_utilities__value_to_float(likes) - loves = Scraping_utilities._Scraping_utilities__value_to_float(loves) - wow = Scraping_utilities._Scraping_utilities__value_to_float(wow) - cares = Scraping_utilities._Scraping_utilities__value_to_float(cares) - sad = Scraping_utilities._Scraping_utilities__value_to_float(sad) - angry = Scraping_utilities._Scraping_utilities__value_to_float(angry) - haha = Scraping_utilities._Scraping_utilities__value_to_float(haha) - - reactions = {"likes" : int(likes),"loves" : int(loves),"wow":int(wow),"cares" : int(cares),"sad":int(sad), - "angry": - int(angry),"haha" : int(haha)} - - #count number of total reactions - total_reaction_count = Scraping_utilities._Scraping_utilities__count_reaction(reactions) - - comments = Finder._Finder__find_comments(post,self.__layout) - comments = int(Scraping_utilities._Scraping_utilities__value_to_float(comments)) - post_content = Finder._Finder__find_content(post,self.__driver,self.__layout) - #extract time - posted_time = Finder._Finder__find_posted_time(post,self.__layout,link_element) - - video = Finder._Finder__find_video_url(post,self.page_name,status) + # converting all reactions to numbers + # e,g reactions may contain counts like "5k","5m", so converting them to actual number + likes = Scraping_utilities._Scraping_utilities__value_to_float( + likes) + loves = Scraping_utilities._Scraping_utilities__value_to_float( + loves) + wow = Scraping_utilities._Scraping_utilities__value_to_float( + wow) + cares = Scraping_utilities._Scraping_utilities__value_to_float( + cares) + sad = Scraping_utilities._Scraping_utilities__value_to_float( + sad) + angry = Scraping_utilities._Scraping_utilities__value_to_float( + angry) + haha = Scraping_utilities._Scraping_utilities__value_to_float( + haha) + + reactions = {"likes": int(likes), "loves": int(loves), "wow": int(wow), "cares": int(cares), "sad": int(sad), + "angry": + int(angry), "haha": int(haha)} + + # count number of total reactions + total_reaction_count = Scraping_utilities._Scraping_utilities__count_reaction( + reactions) + + comments = Finder._Finder__find_comments(post, self.__layout) + comments = int( + Scraping_utilities._Scraping_utilities__value_to_float(comments)) + post_content = Finder._Finder__find_content( + post, self.__driver, self.__layout) + # extract time + posted_time = Finder._Finder__find_posted_time( + post, self.__layout, link_element) + + video = Finder._Finder__find_video_url( + post, self.page_name, status) image = Finder._Finder__find_image_url(post) #post_url = "https://www.facebook.com/{}/posts/{}".format(self.page_name,status) self.__data_dict[status] = { - "name" : name, - "shares" : shares, - "reactions" : reactions, - "reaction_count" : total_reaction_count, - "comments" : comments, - "content" : post_content, - "posted_on" : posted_time, - "video" : video, - "image" : image, - "post_url" :post_url + "name": name, + "shares": shares, + "reactions": reactions, + "reaction_count": total_reaction_count, + "comments": comments, + "content": post_content, + "posted_on": posted_time, + "video": video, + "image": image, + "post_url": post_url } except Exception as ex: print("error at find_elements method : {}".format(ex)) - - diff --git a/facebook_page_scraper/scraping_utilities.py 
b/facebook_page_scraper/scraping_utilities.py index 08c4da2..f388f06 100644 --- a/facebook_page_scraper/scraping_utilities.py +++ b/facebook_page_scraper/scraping_utilities.py @@ -3,9 +3,11 @@ from datetime import datetime as dt import re from datetime import datetime, timedelta + from selenium.webdriver.common.by import By except Exception as ex: print(ex) + class Scraping_utilities: @staticmethod def __extract_numbers(string): @@ -13,14 +15,13 @@ def __extract_numbers(string): e.g => input = '54454 comment', than output => 54454 """ try: - #return string.split(" ")[0] - return re.findall("\d+",string)[0] + # return string.split(" ")[0] + return re.findall("\d+", string)[0] except IndexError: return 0 - @staticmethod - def __exists_in_list(li,word): + def __exists_in_list(li, word): """expects list and a element, returns all the occurence of element in the list. e.g input => li = ['sajid','sajid','sajid','d','s'] with given word = 'sajid', output => ['sajid','sajid','sajid'] """ @@ -37,7 +38,7 @@ def __convert_time(unix_timestamp): def __extract_content(content): """returns the text content of selenium element, else if content is string than returns a empty string""" if type(content) is not str: - all_para = content.find_elements_by_tag_name("p") + all_para = content.find_elements(By.TAG_NAME, "p") paragraph = '' for para in all_para: paragraph += para.get_attribute("textContent") @@ -59,13 +60,13 @@ def __extract_id_from_link(link): """expects the post's URL as a argument, and extracts out post_id from that URL""" try: status = "NA" - #if url pattern container "/posts" + # if url pattern container "/posts" if "posts/" in link: status = link.split('/')[5].split('?')[0] - #if url pattern container "/photos" + # if url pattern container "/photos" elif "photos/" in link: status = link.split("/")[-2] - #if url pattern container "/videos" + # if url pattern container "/videos" if "/videos/" in link: status = link.split("/")[5] elif "fbid=" in link: @@ -100,10 +101,10 @@ def __value_to_float(x): @staticmethod def __find_reaction_by_text(l, string): - reaction = [substring for substring in l if string in substring] - reaction = re.findall( - "\d+", reaction[0])[0] if len(reaction) > 0 else "0" - return reaction + reaction = [substring for substring in l if string in substring] + reaction = re.findall( + "\d+", reaction[0])[0] if len(reaction) > 0 else "0" + return reaction @staticmethod def __convert_to_iso(t): diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2c84015..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -selenium==4.1.0 -webdriver-manager==3.2.2 -selenium-wire==4.3.1 -python-dateutil==2.8.2 \ No newline at end of file diff --git a/setup.py b/setup.py index fd39f6d..8e29f4d 100644 --- a/setup.py +++ b/setup.py @@ -1,26 +1,27 @@ import setuptools -with open("README.md","r") as file: +with open("README.md", "r") as file: long_description = file.read() -requirements = [] +requirements = ['selenium==4.1.0', + 'webdriver-manager==3.2.2', + 'selenium-wire==4.3.1', + 'python-dateutil==2.8.2'] -for line in open("requirements.txt",'r',encoding="utf-8").readlines(): - requirements.append(line.replace("\n","")) setuptools.setup( - name = "facebook_page_scraper", - version = "2.0.2", - author = "Sajid Shaikh", - author_email = "shaikhsajid3732@gmail.com", - description = "Python package to scrap facebook's pages front end with no limitations", - long_description = long_description, - long_description_content_type = "text/markdown", + 
name="facebook_page_scraper", + version="3.0.0", + author="Sajid Shaikh", + author_email="shaikhsajid3732@gmail.com", + description="Python package to scrap facebook's pages front end with no limitations", + long_description=long_description, + long_description_content_type="text/markdown", license="MIT", - url = "https://github.com/shaikhsajid1111/facebook_page_scraper", - keywords = "web-scraping selenium facebook facebook-pages", - packages = setuptools.find_packages(), - classifiers = [ + url="https://github.com/shaikhsajid1111/facebook_page_scraper", + keywords="web-scraping selenium facebook facebook-pages", + packages=setuptools.find_packages(), + classifiers=[ "Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Developers", @@ -40,6 +41,6 @@ "Topic :: Internet :: WWW/HTTP" ], - python_requires = ">=3.6", + python_requires=">=3.6", install_requires=requirements )