# ejercicios/utilities/news_summarize.py
#
# 142 lines
# 3.9 KiB
# Python

import os
import re
import smtplib
from collections import Counter
from email.utils import formatdate
import requests
from bs4 import BeautifulSoup
def fetch_latest_news(article_position, timeout=10):
    """Return the title and absolute link of one article from the listing page.

    The Genbeta "inteligencia-artificial" category page is downloaded and the
    article at ``article_position`` inside its ``section-recent-list`` block
    is inspected.

    Args:
        article_position: Index of the article within the recent list
            (0 is the first ``<article>`` element found).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(title, link)`` tuple. Relative links are prefixed with the
        site's base URL.

    Raises:
        Exception: If the page does not answer with HTTP 200, the recent
            list is missing, or the article has no link.
        IndexError: If the list has no article at ``article_position``.
    """
    url = "https://www.genbeta.com/categoria/inteligencia-artificial"
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error fetching the URL. Code: {response.status_code}")

    # response.text is already a decoded str; from_encoding only applies to
    # bytes input and makes BeautifulSoup emit a warning when given str.
    page = BeautifulSoup(response.text, "html.parser")
    div_recent_list = page.find("div", class_="section-recent-list")
    if not div_recent_list:
        raise Exception("Doesn't find any recent list")

    articles = div_recent_list.find_all("article")
    try:
        article = articles[article_position]
    except IndexError:
        raise IndexError(
            f"Article position {article_position} is out of range: "
            f"only {len(articles)} article(s) found."
        ) from None

    first_news = article.find("a", href=True)
    if not first_news:
        raise Exception("Doesn't find any link.")

    news_title = first_news.get_text(strip=True)
    news_link = first_news["href"]
    if not news_link.startswith("http"):
        news_link = "https://www.genbeta.com" + news_link
    return news_title, news_link
def fetch_article(url, timeout=10):
    """Download an article page and return the text of its paragraphs.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of every ``<p>`` inside the ``article-content`` div, joined
        with single spaces. Empty string when the div has no paragraphs.

    Raises:
        Exception: If the page does not answer with HTTP 200 or has no
            ``article-content`` div.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error fetching the article. Code: {response.status_code}")

    # response.text is already a decoded str; from_encoding only applies to
    # bytes input and makes BeautifulSoup emit a warning when given str.
    page = BeautifulSoup(response.text, "html.parser")
    article_body = page.find("div", {"class": "article-content"})
    if not article_body:
        raise Exception("Is not possible to find content for this artícle.")

    paragraphs = article_body.find_all("p")
    return " ".join(p.get_text(strip=True) for p in paragraphs)
def summarize_article(text, sentences_count=3):
    """Build an extractive summary from the highest-scoring sentences.

    The text is split into sentences, sentences of five words or fewer are
    discarded, and each remaining sentence is scored as the sum of the
    whole-text frequencies of its words. The best ``sentences_count``
    sentences are returned in the order they appear in the text.

    Args:
        text: The article text to summarize.
        sentences_count: Maximum number of sentences to keep. Values of zero
            or less produce an empty summary.

    Returns:
        The selected sentences joined with single spaces, or an empty string
        when no sentence qualifies.
    """
    if sentences_count <= 0:
        return ""

    pieces = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
    # dict.fromkeys drops repeated sentences while keeping first-seen order,
    # so each sentence is scored and emitted at most once.
    sentences = list(
        dict.fromkeys(piece for piece in pieces if len(piece.split()) > 5)
    )

    word_frequencies = Counter(re.findall(r"\w+", text.lower()))

    # Tokenize each sentence with the same pattern used for the frequency
    # table; splitting on whitespace would leave punctuation attached
    # ("word," / "end.") and those words would never match the table.
    sentence_scores = {
        sentence: sum(
            word_frequencies[word] for word in re.findall(r"\w+", sentence.lower())
        )
        for sentence in sentences
    }

    # Precompute positions so restoring document order is a dict lookup
    # instead of a list scan per comparison.
    positions = {sentence: index for index, sentence in enumerate(sentences)}

    best = sorted(sentences, key=sentence_scores.__getitem__, reverse=True)
    best = best[:sentences_count]
    best.sort(key=positions.__getitem__)
    return " ".join(best)
def send_email_with_summaries(message_body):
    """Send the summaries by email through smtp.gmail.com (port 587, STARTTLS).

    The sender address, recipient address and password are read from the
    ``EMAIL_FROM``, ``EMAIL_TO`` and ``EMAIL_PASSWORD`` environment variables.

    Args:
        message_body: Plain-text body of the email.

    Raises:
        RuntimeError: If any of the required environment variables is unset
            or empty.
        Exception: If the server reports refused recipients.
    """
    # Local import keeps this block self-contained; the module's import
    # section does not provide EmailMessage.
    from email.message import EmailMessage

    settings = {
        name: os.getenv(name)
        for name in ("EMAIL_FROM", "EMAIL_TO", "EMAIL_PASSWORD")
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        # Fail before opening a connection instead of letting login() fail
        # with an unrelated-looking error on a None credential.
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )
    email_from = settings["EMAIL_FROM"]
    email_to = settings["EMAIL_TO"]
    password = settings["EMAIL_PASSWORD"]

    # EmailMessage writes the From/To/Date headers and the MIME charset
    # declaration, so non-ASCII text in the body is labelled correctly.
    message = EmailMessage()
    message["From"] = email_from
    message["To"] = email_to
    message["Date"] = formatdate(localtime=True)
    message["Subject"] = f"News summary: {formatdate(localtime=True)}"
    message.set_content(message_body)

    # The context manager issues QUIT and closes the connection on exit, so
    # no explicit quit() is needed.
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()
        server.login(email_from, password)
        # Envelope addresses are passed explicitly so they stay exactly the
        # values read from the environment.
        refused = server.send_message(
            message, from_addr=email_from, to_addrs=email_to
        )
    if refused:
        raise Exception("The email fail to send it")
def main(article_count=3):
    """Fetch the latest articles and print a bullet-point summary of each.

    For every position from 0 up to ``article_count`` the article title and
    link are fetched, the article body is downloaded and summarized, and the
    result is printed. The same text is accumulated as an email body; sending
    it is currently disabled. Any exception is reported on stdout instead of
    propagating.

    Args:
        article_count: How many articles from the top of the listing to
            process.
    """
    try:
        report = []
        for position in range(article_count):
            news_title, news_link = fetch_latest_news(position)
            heading = f"New find: {news_title} ({news_link})"
            report.append(heading)
            print(heading)

            content = fetch_article(news_link)
            if not content:
                # Close the heading line so the next entry does not run into
                # it in the accumulated text.
                report.append("\n")
                print("Is not possible to retrieve content for this new.")
                continue

            summary = summarize_article(content)
            report.append("\n=== Resume ===\n")
            print("\n=== Resume ===\n")
            for fragment in summary.split(". "):
                fragment = fragment.strip()
                if not fragment:
                    # An empty summary would otherwise yield a "- ." bullet.
                    continue
                # split(". ") strips the period from every fragment except
                # the last; only add one where terminal punctuation is
                # missing, avoiding "sentence.." and "question?.".
                if not fragment.endswith((".", "?", "!")):
                    fragment += "."
                bullet = f"- {fragment}"
                report.append(bullet + "\n")
                print(bullet)

            separator = "\n" + "=" * 20 + "\n"
            report.append(separator)
            print(separator)

        # Email delivery is disabled; `summaries` is the body it would send.
        summaries = "".join(report)
        # send_email_with_summaries(summaries)
        print("Success!")
    except Exception as e:
        print(f"Error: {e}")
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()