diff --git a/src/services/__pycache__/threads_manager.cpython-313.pyc b/src/services/__pycache__/threads_manager.cpython-313.pyc
index 1de9aa9..394c7df 100644
Binary files a/src/services/__pycache__/threads_manager.cpython-313.pyc and b/src/services/__pycache__/threads_manager.cpython-313.pyc differ
diff --git a/src/services/scrapper.py b/src/services/scrapper.py
new file mode 100644
index 0000000..6ab0456
--- /dev/null
+++ b/src/services/scrapper.py
@@ -0,0 +1,92 @@
+import threading
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+import mysql.connector
+
+class Scrapper:
+    def __init__(self, ui_instance):
+        self.ui_instance = ui_instance
+        self.visited_links = set()
+        self.running = False
+        self.lock = threading.Lock()
+
+        # Database configuration for the link store
+        self.db_config = {
+            "host": "localhost",
+            "user": "root",
+            "password": "1234Scrap",
+            "database": "scrap_links_db"
+        }
+
+    def start_scraping(self):
+        """Start the scraping process"""
+        self.running = True
+        url = self.get_url_from_ui()
+        if url:
+            self.scrape_page(url)
+
+    def stop_scraping(self):
+        """Stop the scraping process"""
+        self.running = False
+
+    def scrape_page(self, url):
+        """Scrape a page and follow the links it contains"""
+        if not self.running:
+            return
+
+        # Check and mark the URL under the lock so two threads
+        # cannot scrape the same page twice.
+        with self.lock:
+            if url in self.visited_links:
+                return
+            self.visited_links.add(url)
+
+        try:
+            response = requests.get(url, timeout=10)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "html.parser")
+                links = [urljoin(url, a.get("href")) for a in soup.find_all("a", href=True)]
+                self.update_ui(url, links)
+                self.save_links_to_db(url, links)
+
+                for link in links:
+                    if self.running:
+                        threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
+            else:
+                print(f"Error accessing {url}: {response.status_code}")
+        except Exception as e:
+            print(f"Error scraping {url}: {e}")
+
+    def update_ui(self, url, links):
+        """Update the 'Scrapping' tab with the links found"""
+        tab = self.ui_instance.tabs["Scrapping"]
+        text_widget = tab.text_widget
+        text_widget.insert("end", f"Links found on {url}:\n")
+        for link in links:
+            text_widget.insert("end", f"  - {link}\n")
+        text_widget.insert("end", "\n")
+        text_widget.see("end")
+
+    def save_links_to_db(self, url, links):
+        """Store the links in the database"""
+        try:
+            connection = mysql.connector.connect(**self.db_config)
+            cursor = connection.cursor()
+            cursor.execute("CREATE TABLE IF NOT EXISTS links (id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
+            for link in links:
+                cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, url))
+            connection.commit()
+            cursor.close()
+            connection.close()
+        except mysql.connector.Error as e:
+            print(f"Error saving to the database: {e}")
+
+    def get_url_from_ui(self):
+        """Read the URL from the user interface"""
+        try:
+            url_entry = self.ui_instance.left_panel.url_entry
+            return url_entry.get()
+        except AttributeError:
+            print("Could not read the URL from the interface")
+            return None
\ No newline at end of file
diff --git a/src/services/threads_manager.py b/src/services/threads_manager.py
index 4c13bec..1e0fed9 100644
--- a/src/services/threads_manager.py
+++ b/src/services/threads_manager.py
@@ -6,6 +6,7 @@ import random
 from services.threaden_task import ThreadenTask
 from services.system_monitor import SystemMonitor
 from services.tetris_game import TetrisGame
+from services.scrapper import Scrapper
 
 class ThreadsManager:
     """Constructor"""
@@ -17,8 +18,10 @@ class ThreadsManager:
"temperature": ThreadenTask(), "emails":ThreadenTask(), "tetris_game":ThreadenTask(), + "scrapper":ThreadenTask(), } self.system_monitor_tasks = {} + self.scrapper = Scrapper(ui_instance) @@ -35,6 +38,7 @@ class ThreadsManager: self.tasks["time"].start(self.update_time) self.tasks["temperature"].start(self.update_temperature) self.tasks["emails"].start(self.update_emails) + self.tasks["scrapper"].start(self.scrapper.start_scraping) if self.system_monitor: for metric in self.system_monitor.metrics.keys(): diff --git a/src/ui/__pycache__/centered_window.cpython-313.pyc b/src/ui/__pycache__/centered_window.cpython-313.pyc index 0724c48..8b00711 100644 Binary files a/src/ui/__pycache__/centered_window.cpython-313.pyc and b/src/ui/__pycache__/centered_window.cpython-313.pyc differ diff --git a/src/ui/centered_window.py b/src/ui/centered_window.py index 1bb970d..567df02 100644 --- a/src/ui/centered_window.py +++ b/src/ui/centered_window.py @@ -90,6 +90,20 @@ class CenteredWindow(ctk.CTk): btn = ctk.CTkButton(left_panel, text=text, command=command, width=150) btn.pack(pady=5, padx=10) + scrapping_label = ctk.CTkLabel(left_panel, text="Scrapping", font=("Arial", 12, "bold")) + scrapping_label.pack(anchor=ctk.W, pady=5, padx=10) + url_entry = ctk.CTkEntry(left_panel, placeholder_text="Introduce la URL") + url_entry.pack(pady=5, padx=10) + + self.left_panel = left_panel + self.left_panel.url_entry = url_entry + start_button = ctk.CTkButton(left_panel, text="Iniciar Scrapping", command=lambda: + self.thread_manager.tasks["scrapper"].start(self.thread_manager.scrapper.start_scraping)) + start_button.pack(pady=5, padx=10) + + stop_button = ctk.CTkButton(left_panel, text="Detener Scrapping", command=self.thread_manager.tasks["scrapper"].stop) + stop_button.pack("pady=5, padx=10") + def create_center_panel(self): @@ -101,7 +115,7 @@ class CenteredWindow(ctk.CTk): tab_view.pack(fill=ctk.BOTH, expand=True) # Crear pestañas y manejar contenido por separado - for tab_name in ["Resultados Scrapping", "Navegador", "Correos", "Juego", "Sistema"]: + for tab_name in ["Scrapping", "Navegador", "Correos", "Juego", "Sistema"]: tab = tab_view.add(tab_name) if tab_name == "Sistema":