diff --git a/src/services/__pycache__/scrapper.cpython-313.pyc b/src/services/__pycache__/scrapper.cpython-313.pyc
index 6a04ec1..a37199b 100644
Binary files a/src/services/__pycache__/scrapper.cpython-313.pyc and b/src/services/__pycache__/scrapper.cpython-313.pyc differ
diff --git a/src/services/scrapper.py b/src/services/scrapper.py
index a1594e0..3c477de 100644
--- a/src/services/scrapper.py
+++ b/src/services/scrapper.py
@@ -3,6 +3,7 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 import mysql.connector
+from queue import Queue
 
 # http://books.toscrape.com/  test site to scrape
@@ -12,6 +13,7 @@ class Scrapper:
         self.visited_links = set()
         self.running = False
         self.lock = threading.Lock()
+        self.link_queue = Queue()
 
         # Configure the database for the links
         self.db_config = {
@@ -34,8 +36,9 @@
         self.running = True
         url = self.get_url_from_ui()
         if url:
-            print(f"Starting scraping at: {url}")
-            self.scrape_page(url)
+            print(f"Starting scraping at: {url}")
+            threading.Thread(target=self.scrape_page, args=(url,), daemon=True).start()
+            threading.Thread(target=self.insert_links_to_db, daemon=True).start()
         else:
             print("No valid URL was provided.")
 
@@ -44,6 +47,10 @@
         self.running = False
         print("Scraping stopped. Process finished.")
 
+        # Empty the queue so the insertion thread stops
+        while not self.link_queue.empty():
+            self.link_queue.get()
+
         # Update the "Scrapping" tab with a message
         tab = self.ui_instance.tabs["Scrapping"]
         text_widget = tab["text_widget"]
@@ -67,30 +74,66 @@
                 soup = BeautifulSoup(response.text, "html.parser")
                 links = [urljoin(url, a.get("href")) for a in soup.find_all("a", href=True)]
                 self.update_ui(url, links)
-                self.save_links_to_db(url, links)
 
                 for link in links:
-                    if self.running:
-                        threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
+                    if not self.running:
+                        break
+                    self.link_queue.put((url, link))
+
+                for link in links:
+                    if not self.running:
+                        break
+                    threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
             else:
                 print(f"Error accessing {url}: {response.status_code}")
         except Exception as e:
             print(f"Error scraping {url}: {e}")
 
-    def update_ui(self, url, links):
-        """Update the 'Scrapping' tab with the links found"""
-        tab = self.ui_instance.tabs["Scrapping"]
-        text_widget = tab["text_widget"]
+    def update_ui(self, url, links):
+        """Update the 'Scrapping' tab with the links found"""
+        tab = self.ui_instance.tabs["Scrapping"]
+        text_widget = tab["text_widget"]
 
-        text_widget.configure(state="normal")
-        text_widget.insert("end", f"Links found on {url}:\n")
-        for link in links:
+        text_widget.configure(state="normal")
+        text_widget.insert("end", f"Links found on {url}:\n")
+        for link in links:
             text_widget.insert("end", f"  - {link}\n")
-        text_widget.see("end")
+        text_widget.see("end")
         text_widget.configure(state="disabled")
+
+
+    def insert_links_to_db(self):
+        """Insert links into the database from the queue"""
+        while self.running or not self.link_queue.empty():
+            try:
+                # Get a link from the queue
+                if not self.running and self.link_queue.empty():
+                    break
+                parent_url, link = self.link_queue.get(timeout=1)  # Wait up to 1 second if the queue is empty
+                connection = mysql.connector.connect(**self.db_config)
+                cursor = connection.cursor()
+                cursor.execute("CREATE TABLE IF NOT EXISTS links (id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
+                cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, parent_url))
+                connection.commit()
+                cursor.close()
+                connection.close()
+                print(f"Link saved: {link} (parent: {parent_url})")
+            except Exception as e:
+                print(f"Error saving to the database: {e}")
+
+
+    def get_url_from_ui(self):
+        """Get the URL from the user interface"""
+        try:
+            url_entry = self.ui_instance.left_panel.url_entry
+            return url_entry.get()
+        except AttributeError:
+            print("Could not get the URL from the interface")
+            return None
+"""
     def save_links_to_db(self, url, links):
-        """Save the links to the database"""
+        Save the links to the database
         try:
             connection = mysql.connector.connect(**self.db_config)
             cursor = connection.cursor()
@@ -105,13 +148,6 @@
             connection.close()
         except Exception as e:
             print(f"Error saving to the database: {e}")
-
-    def get_url_from_ui(self):
-        """Get the URL from the user interface"""
-        try:
-            url_entry = self.ui_instance.left_panel.url_entry
-            return url_entry.get()
-        except AttributeError:
-            print("Could not get the URL from the interface")
-            return None
+"""
+
\ No newline at end of file
diff --git a/src/ui/__pycache__/centered_window.cpython-313.pyc b/src/ui/__pycache__/centered_window.cpython-313.pyc
index 3ba83e6..9421f41 100644
Binary files a/src/ui/__pycache__/centered_window.cpython-313.pyc and b/src/ui/__pycache__/centered_window.cpython-313.pyc differ