Trying to make the scrapper work on different threads; it works, but too fast, and the app sometimes crashes

This commit is contained in:
Dennis Eckerskorn 2024-12-08 20:29:07 +01:00
parent 2c41c9c8b8
commit 55b994f38e
3 changed files with 59 additions and 23 deletions
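The "too fast" behavior described in the commit message is typical of spawning one daemon thread per discovered link with no cap or delay. A minimal sketch of one way to throttle that, assuming a Scrapper-like object with a scrape_page method (MAX_WORKERS, REQUEST_DELAY, and throttled_scrape are illustrative names, not part of this commit):

import threading
import time

MAX_WORKERS = 8        # assumed cap on concurrent scraping threads
REQUEST_DELAY = 0.5    # assumed politeness delay between requests, in seconds

worker_slots = threading.Semaphore(MAX_WORKERS)

def throttled_scrape(scrapper, url):
    """Run scrapper.scrape_page(url) with at most MAX_WORKERS threads active."""
    with worker_slots:              # blocks while MAX_WORKERS scrapes are running
        time.sleep(REQUEST_DELAY)   # slow the crawl so the target site is not hammered
        scrapper.scrape_page(url)

# usage sketch:
# threading.Thread(target=throttled_scrape, args=(scrapper, url), daemon=True).start()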


@@ -3,6 +3,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 import mysql.connector
+from queue import Queue

 # http://books.toscrape.com/ test scrape site
@@ -12,6 +13,7 @@ class Scrapper:
         self.visited_links = set()
         self.running = False
         self.lock = threading.Lock()
+        self.link_queue = Queue()

         # Set up the database for the links
         self.db_config = {
@@ -35,7 +37,8 @@ class Scrapper:
         url = self.get_url_from_ui()
         if url:
             print(f"Starting scraping at: {url}")
-            self.scrape_page(url)
+            threading.Thread(target=self.scrape_page, args=(url,), daemon=True).start()
+            threading.Thread(target=self.insert_links_to_db, daemon=True).start()
         else:
             print("No valid URL was provided.")
@@ -44,6 +47,10 @@ class Scrapper:
         self.running = False
         print("Scraping stopped. Process finished.")

+        # Empty the queue to stop the insertion thread
+        while not self.link_queue.empty():
+            self.link_queue.get()
+
         # Update the "Scrapping" tab with a message
         tab = self.ui_instance.tabs["Scrapping"]
         text_widget = tab["text_widget"]
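Draining the queue in stop() races with scraper threads that are still putting links, and the inserter thread can stay blocked inside get(). A self-contained sketch of the usual alternative, a sentinel object that tells the consumer to exit (STOP and consumer are illustrative names, not from this commit):

import threading
from queue import Queue

STOP = object()  # sentinel: tells the consumer no more links are coming

def consumer(q: Queue):
    while True:
        item = q.get()
        if item is STOP:     # clean shutdown instead of draining the queue
            break
        parent_url, link = item
        print(f"would insert {link} (parent: {parent_url})")

q = Queue()
t = threading.Thread(target=consumer, args=(q,), daemon=True)
t.start()
q.put(("http://books.toscrape.com/", "http://books.toscrape.com/catalogue/"))
q.put(STOP)  # signal shutdown once producers are done
t.join()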
@@ -67,11 +74,16 @@ class Scrapper:
                 soup = BeautifulSoup(response.text, "html.parser")
                 links = [urljoin(url, a.get("href")) for a in soup.find_all("a", href=True)]
                 self.update_ui(url, links)
-                self.save_links_to_db(url, links)

                 for link in links:
-                    if self.running:
-                        threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
+                    if not self.running:
+                        break
+                    self.link_queue.put((url, link))
+
+                for link in links:
+                    if not self.running:
+                        break
+                    threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
             else:
                 print(f"Error accessing {url}: {response.status_code}")
         except Exception as e:
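Even with the queue, scrape_page still starts a new daemon thread for every link, so the thread count grows with the number of links on each page, which fits the crash in the commit message. A sketch of the same crawl on a fixed-size pool (concurrent.futures is standard library; the pool size and the crawl function are assumptions, not this commit's code):

import threading
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

pool = ThreadPoolExecutor(max_workers=8)  # fixed thread count (assumed value)
visited = set()
visited_lock = threading.Lock()

def crawl(url):
    with visited_lock:
        if url in visited:   # skip pages already queued, like self.visited_links
            return
        visited.add(url)
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return
    soup = BeautifulSoup(response.text, "html.parser")
    for a in soup.find_all("a", href=True):
        # resubmit children to the same bounded pool instead of spawning new threads
        pool.submit(crawl, urljoin(url, a["href"]))

pool.submit(crawl, "http://books.toscrape.com/")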
@@ -89,8 +101,39 @@ class Scrapper:
         text_widget.see("end")
         text_widget.configure(state="disabled")

+    def insert_links_to_db(self):
+        """Inserts the links into the database from the queue"""
+        while self.running or not self.link_queue.empty():
+            try:
+                # Take a link from the queue
+                if not self.running and self.link_queue.empty():
+                    break
+                parent_url, link = self.link_queue.get(timeout=1)  # Waits 1 second if the queue is empty
+
+                connection = mysql.connector.connect(**self.db_config)
+                cursor = connection.cursor()
+                cursor.execute("CREATE TABLE IF NOT EXISTS links (id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
+                cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, parent_url))
+                connection.commit()
+                cursor.close()
+                connection.close()
+
+                print(f"Link saved: {link} (parent: {parent_url})")
+            except Exception as e:
+                print(f"Error saving to the database: {e}")
+
+    def get_url_from_ui(self):
+        """Gets the URL from the user interface"""
+        try:
+            url_entry = self.ui_instance.left_panel.url_entry
+            return url_entry.get()
+        except AttributeError:
+            print("Could not get the URL from the interface")
+            return None
+
+    """
     def save_links_to_db(self, url, links):
-        """Saves the links to the database"""
+        Saves the links to the database
         try:
             connection = mysql.connector.connect(**self.db_config)
             cursor = connection.cursor()
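The new insert_links_to_db opens and closes a MySQL connection for every single link, and its bare except also swallows the queue.Empty raised by get(timeout=1). A sketch of the consumer with one connection reused for its whole lifetime (assumes the same db_config and links table as above):

import mysql.connector
from queue import Empty

def insert_links_to_db(self):
    """Consumer sketch: one connection reused for every insert."""
    connection = mysql.connector.connect(**self.db_config)
    cursor = connection.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS links (id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
    try:
        while self.running or not self.link_queue.empty():
            try:
                parent_url, link = self.link_queue.get(timeout=1)
            except Empty:
                continue  # queue was empty for a second: re-check self.running
            cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, parent_url))
            connection.commit()
    finally:
        cursor.close()
        connection.close()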
@@ -105,13 +148,6 @@ class Scrapper:
             connection.close()
         except Exception as e:
             print(f"Error saving to the database: {e}")
+    """

-    def get_url_from_ui(self):
-        """Gets the URL from the user interface"""
-        try:
-            url_entry = self.ui_instance.left_panel.url_entry
-            return url_entry.get()
-        except AttributeError:
-            print("Could not get the URL from the interface")
-            return None