Added scrapper, still fixing issues; just an intermediate save

Dennis Eckerskorn 2024-12-08 17:40:46 +01:00
parent add737720e
commit a645184cff
5 changed files with 108 additions and 1 deletion

src/services/scrapper.py (new file, 89 lines)

@@ -0,0 +1,89 @@
import threading
from urllib.parse import urljoin

import mysql.connector
import requests
from bs4 import BeautifulSoup


class Scrapper:
    def __init__(self, ui_instance):
        self.ui_instance = ui_instance
        self.visited_links = set()
        self.running = False
        self.lock = threading.Lock()  # guards visited_links across crawl threads

        # Database configuration for storing the scraped links
        self.db_config = {
            "host": "localhost",
            "user": "root",
            "password": "1234Scrap",
            "database": "scrap_links_db"
        }

    def start_scraping(self):
        """Starts the scraping process."""
        self.running = True
        url = self.get_url_from_ui()
        if url:
            self.scrape_page(url)

    def stop_scraping(self):
        """Stops the scraping process."""
        self.running = False

    def scrape_page(self, url):
        """Scrapes a page and follows the links it finds."""
        if not self.running:
            return

        # Check and mark the URL under the lock so two threads
        # cannot crawl the same page twice.
        with self.lock:
            if url in self.visited_links:
                return
            self.visited_links.add(url)

        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                links = [urljoin(url, a.get("href")) for a in soup.find_all("a", href=True)]
                self.update_ui(url, links)
                self.save_links_to_db(url, links)
                # Crawl every discovered link in its own daemon thread
                for link in links:
                    if self.running:
                        threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
            else:
                print(f"Error accessing {url}: {response.status_code}")
        except Exception as e:
            print(f"Error scraping {url}: {e}")

    def update_ui(self, url, links):
        """Updates the 'Scrapping' tab with the links found."""
        tab = self.ui_instance.tabs["Scrapping"]
        text_widget = tab.text_widget
        text_widget.insert("end", f"Links found on {url}:\n")
        for link in links:
            text_widget.insert("end", f" - {link}\n")
        text_widget.insert("end", "\n")
        text_widget.see("end")

    def save_links_to_db(self, url, links):
        """Saves the links to the database."""
        try:
            connection = mysql.connector.connect(**self.db_config)
            cursor = connection.cursor()
            cursor.execute(
                "CREATE TABLE IF NOT EXISTS links ("
                "id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)"
            )
            for link in links:
                cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, url))
            connection.commit()
            cursor.close()
            connection.close()
        except mysql.connector.Error as e:
            print(f"Error saving to the database: {e}")

    def get_url_from_ui(self):
        """Gets the URL from the user interface."""
        try:
            url_entry = self.ui_instance.left_panel.url_entry
            return url_entry.get()
        except AttributeError:
            print("Could not get the URL from the interface")
            return None
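For quick manual testing outside the full UI, something along the following lines can exercise the class. This is a hypothetical smoke test: the SimpleNamespace stubs stand in for the real customtkinter widgets, the URL is a placeholder, and save_links_to_db will simply log an error if the local MySQL instance or the scrap_links_db database is missing.

```python
# Hypothetical smoke test for Scrapper; the stubs below are illustrative
# stand-ins for the customtkinter widgets, not part of this commit.
import time
from types import SimpleNamespace

from services.scrapper import Scrapper

class FakeText:
    """Mimics the two text-widget methods update_ui relies on."""
    def insert(self, index, text):
        print(text, end="")
    def see(self, index):
        pass

ui = SimpleNamespace(
    tabs={"Scrapping": SimpleNamespace(text_widget=FakeText())},
    left_panel=SimpleNamespace(
        url_entry=SimpleNamespace(get=lambda: "https://example.com")  # placeholder URL
    ),
)

scrapper = Scrapper(ui)
scrapper.start_scraping()   # scrapes the first page synchronously
time.sleep(10)              # let the daemon crawl threads work for a while
scrapper.stop_scraping()    # clears the running flag
```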


@@ -6,6 +6,7 @@ import random
from services.threaden_task import ThreadenTask
from services.system_monitor import SystemMonitor
from services.tetris_game import TetrisGame
from services.scrapper import Scrapper

class ThreadsManager:
    """Constructor"""
@@ -17,8 +18,10 @@ class ThreadsManager:
            "temperature": ThreadenTask(),
            "emails": ThreadenTask(),
            "tetris_game": ThreadenTask(),
            "scrapper": ThreadenTask(),
        }
        self.system_monitor_tasks = {}
        self.scrapper = Scrapper(ui_instance)
@@ -35,6 +38,7 @@ class ThreadsManager:
        self.tasks["time"].start(self.update_time)
        self.tasks["temperature"].start(self.update_temperature)
        self.tasks["emails"].start(self.update_emails)
        self.tasks["scrapper"].start(self.scrapper.start_scraping)
        if self.system_monitor:
            for metric in self.system_monitor.metrics.keys():
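This hunk leans on ThreadenTask exposing start(target) and a no-argument stop(); that file is not shown in this commit. A minimal wrapper consistent with these call sites might look like the sketch below (inferred from usage, not the project's actual implementation):

```python
import threading

# Minimal sketch of the ThreadenTask wrapper implied by the call sites
# (tasks["scrapper"].start(fn) / .stop); services/threaden_task.py may differ.
class ThreadenTask:
    def __init__(self):
        self.thread = None
        self.running = False

    def start(self, target, *args):
        """Run `target` once in a background daemon thread."""
        if self.thread and self.thread.is_alive():
            return  # already running
        self.running = True
        self.thread = threading.Thread(target=target, args=args, daemon=True)
        self.thread.start()

    def stop(self):
        """Signal the task to stop; the target must honor the flag."""
        self.running = False
```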


@@ -90,6 +90,20 @@ class CenteredWindow(ctk.CTk):
            btn = ctk.CTkButton(left_panel, text=text, command=command, width=150)
            btn.pack(pady=5, padx=10)
        scrapping_label = ctk.CTkLabel(left_panel, text="Scrapping", font=("Arial", 12, "bold"))
        scrapping_label.pack(anchor=ctk.W, pady=5, padx=10)

        url_entry = ctk.CTkEntry(left_panel, placeholder_text="Enter the URL")
        url_entry.pack(pady=5, padx=10)

        self.left_panel = left_panel
        self.left_panel.url_entry = url_entry

        start_button = ctk.CTkButton(left_panel, text="Start Scrapping", command=lambda:
            self.thread_manager.tasks["scrapper"].start(self.thread_manager.scrapper.start_scraping))
        start_button.pack(pady=5, padx=10)

        stop_button = ctk.CTkButton(left_panel, text="Stop Scrapping", command=self.thread_manager.tasks["scrapper"].stop)
        stop_button.pack(pady=5, padx=10)
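One wiring detail worth noting: the stop button only calls ThreadenTask.stop, while Scrapper.scrape_page loops on its own self.running flag, which only stop_scraping() clears, so the crawl threads would keep running. A stop handler that flips both flags might look like this sketch (assuming the stop() semantics used above):

```python
# Sketch: stop both the scraper's own flag and the task wrapper
# (assumes ThreadenTask.stop() exists as used elsewhere in this commit).
def stop_scrapping():
    self.thread_manager.scrapper.stop_scraping()   # clears Scrapper.running
    self.thread_manager.tasks["scrapper"].stop()   # stops the task wrapper

stop_button = ctk.CTkButton(left_panel, text="Stop Scrapping", command=stop_scrapping)
```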
    def create_center_panel(self):
@@ -101,7 +115,7 @@ class CenteredWindow(ctk.CTk):
        tab_view.pack(fill=ctk.BOTH, expand=True)

        # Create the tabs and handle each tab's content separately
        for tab_name in ["Scrapping", "Navegador", "Correos", "Juego", "Sistema"]:
            tab = tab_view.add(tab_name)
            if tab_name == "Sistema":
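One loose end: Scrapper.update_ui reads self.ui_instance.tabs["Scrapping"].text_widget, but this hunk does not show a text widget being attached to the tab or the tab being registered in self.tabs. A sketch of that wiring, with assumed attribute names, might be:

```python
# Hypothetical wiring so Scrapper.update_ui can find its widget;
# attribute names are assumed, not confirmed by this hunk.
self.tabs = {}
for tab_name in ["Scrapping", "Navegador", "Correos", "Juego", "Sistema"]:
    tab = tab_view.add(tab_name)
    if tab_name == "Scrapping":
        tab.text_widget = ctk.CTkTextbox(tab)
        tab.text_widget.pack(fill=ctk.BOTH, expand=True)
    self.tabs[tab_name] = tab
```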