Skip to content

Commit

Permalink
Refactor article parsing logic in fetch_and_parse_articles
Browse files (browse the repository at this point in the history)
  • Loading branch information
Cdaprod committed Mar 14, 2024
1 parent ec73db4 commit 93ac795
Showing 1 changed file with 22 additions and 3 deletions.
25 changes: 22 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,25 @@
import pytz
import os

# def fetch_and_parse_articles():
# url = 'https://blog.min.io/author/david-cannan'
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')

# articles = []
# for article in soup.select('article.post-card'):
# title = article.find('h2').text.strip()
# author = 'David Cannan'
# summary = article.select_one('div.post__content > p').text.strip() if article.select_one('div.post__content > p') else ''
# date = article.find('time').text.strip() if article.find('time') else ''

# title_link = article.find('h2').find('a')
# link = title_link['href'] if title_link else ''

# articles.append((title, author, summary, date, link))

# return pd.DataFrame(articles, columns=['title', 'author', 'summary', 'date', 'url'])

def fetch_and_parse_articles():
url = 'https://blog.min.io/author/david-cannan'
response = requests.get(url)
Expand All @@ -17,13 +36,13 @@ def fetch_and_parse_articles():
summary = article.select_one('div.post__content > p').text.strip() if article.select_one('div.post__content > p') else ''
date = article.find('time').text.strip() if article.find('time') else ''

title_link = article.find('h2').find('a')
link = title_link['href'] if title_link else ''
article_link = article.select_one('a.post__more')
link = article_link['href'] if article_link else ''

articles.append((title, author, summary, date, link))

return pd.DataFrame(articles, columns=['title', 'author', 'summary', 'date', 'url'])

def extract_article_content(url):
try:
response = requests.get(url)
Expand Down

0 comments on commit 93ac795

Please sign in to comment.