mirror of https://github.com/JavMB/ejercicios
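"""Fetch the latest AI news from Genbeta, build a short extractive summary of
each article, and (optionally) email the collected summaries.

Pipeline: fetch_latest_news -> fetch_article -> summarize_article
-> send_email_with_summaries (the email step is commented out in main()).
"""
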
import os
import re
import smtplib
from collections import Counter
from email.utils import formatdate

import requests
from bs4 import BeautifulSoup


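# NOTE: the scraping below assumes Genbeta's current markup (the
# "section-recent-list" and "article-content" classes); if the site layout
# changes, these selectors will need to be updated.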
def fetch_latest_news(article_position):
    """Return (title, link) for the article at `article_position` in Genbeta's AI section."""
    url = "https://www.genbeta.com/categoria/inteligencia-artificial"

    response = requests.get(url)

    if response.status_code != 200:
        raise Exception(f"Error fetching the URL. Code: {response.status_code}")

    # response.text is already decoded, so no from_encoding argument is needed
    page = BeautifulSoup(response.text, "html.parser")

    div_recent_list = page.find("div", class_="section-recent-list")

    if not div_recent_list:
        raise Exception("Could not find the recent news list.")

    article = div_recent_list.find_all("article")[article_position]

    first_news = article.find("a", href=True)

    if not first_news:
        raise Exception("Could not find a link in the article.")

    news_title = first_news.get_text(strip=True)
    news_link = first_news["href"]

    # Resolve relative URLs against the site root
    if not news_link.startswith("http"):
        news_link = "https://www.genbeta.com" + news_link

    return news_title, news_link


def fetch_article(url):
    """Download an article page and return its body text as a single string."""
    response = requests.get(url)

    if response.status_code != 200:
        raise Exception(f"Error fetching the article. Code: {response.status_code}")

    page = BeautifulSoup(response.text, "html.parser")

    article_body = page.find("div", {"class": "article-content"})

    if not article_body:
        raise Exception("Could not find content for this article.")

    paragraphs = article_body.find_all("p")
    content = " ".join(p.get_text(strip=True) for p in paragraphs)

    return content


def summarize_article(text, sentences_count=3):
    """Build a simple extractive summary: score each sentence by the total
    frequency of its words across the article, keep the `sentences_count`
    highest-scoring sentences, and return them in their original order."""
    # Split on sentence-ending punctuation, avoiding common abbreviations
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)

    # Drop very short sentences, which rarely carry the main ideas
    sentences = [sentence for sentence in sentences if len(sentence.split()) > 5]

    words = re.findall(r"\w+", text.lower())
    word_frequencies = Counter(words)

    sentence_scores = {
        sentence: sum(word_frequencies.get(word.lower(), 0) for word in sentence.split())
        for sentence in sentences
    }

    summarized_sentences = sorted(
        sentence_scores.keys(),
        key=lambda sentence: sentence_scores[sentence],
        reverse=True
    )[:sentences_count]

    # Restore the original reading order of the selected sentences
    ordered_summary = sorted(
        summarized_sentences,
        key=lambda sentence: sentences.index(sentence)
    )

    return " ".join(ordered_summary)


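# Credentials are read from environment variables (the names below come from the
# os.getenv calls in send_email_with_summaries); example values are placeholders:
#   EMAIL_FROM     - sender address, e.g. "you@example.com"
#   EMAIL_TO       - recipient address
#   EMAIL_PASSWORD - for Gmail SMTP this is usually an app password rather than
#                    the normal account password.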
def send_email_with_summaries(message_body):
    """Send the collected summaries by email over Gmail's SMTP server."""
    email_from = os.getenv("EMAIL_FROM")
    email_to = os.getenv("EMAIL_TO")
    password = os.getenv("EMAIL_PASSWORD")

    subject = f"News summary: {formatdate(localtime=True)}"

    message = f"Subject: {subject}\n\n{message_body}".encode("utf-8")

    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()
        server.login(email_from, password)
        # sendmail() returns a dict of refused recipients; empty means success.
        result = server.sendmail(email_from, email_to, message)
        # No explicit quit() is needed: the context manager closes the connection.

    if result:
        raise Exception(f"The email could not be delivered to: {result}")


def main():
    try:
        summaries = ""
        for i in range(3):
            news_title, news_link = fetch_latest_news(i)
            summaries += f"News found: {news_title} ({news_link})\n"
            print(f"News found: {news_title} ({news_link})")

            content = fetch_article(news_link)

            if content:
                summary = summarize_article(content)
                summaries += "\n=== Summary ===\n"
                print("\n=== Summary ===\n")
                lines = summary.split(". ")
                for line in lines:
                    summaries += f"- {line.strip()}.\n"
                    print(f"- {line.strip()}.")

                summaries += "\n" + "=" * 20 + "\n"
                print("\n" + "=" * 20 + "\n")
            else:
                print("Could not retrieve content for this article.")

        # send_email_with_summaries(summaries)
        print("Success!")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()