Trying to make the scrapper work on different threads; it works, but too fast, and the app sometimes crashes
This commit is contained in:
parent 2c41c9c8b8
commit 55b994f38e
@@ -3,6 +3,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 import mysql.connector
+from queue import Queue

 # http://books.toscrape.com/ (test site for scraping)

@@ -12,6 +13,7 @@ class Scrapper:
         self.visited_links = set()
         self.running = False
         self.lock = threading.Lock()
+        self.link_queue = Queue()

         # Configure the database for the links
         self.db_config = {
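The new link_queue turns link persistence into a producer-consumer handoff: the scraping threads put (parent_url, link) pairs on the queue, and a single writer thread (started below) drains it into MySQL. A minimal, self-contained sketch of that pattern, independent of this class:

    import threading
    from queue import Queue

    q = Queue()

    def producer():
        for i in range(5):
            q.put(("http://example.com", f"http://example.com/page{i}"))

    def consumer():
        while True:
            parent, link = q.get()
            print(f"persist {link} (parent: {parent})")
            q.task_done()  # mark the item as processed so q.join() can return

    threading.Thread(target=consumer, daemon=True).start()
    producer()
    q.join()  # blocks until every queued item has been handled

Queue is thread-safe, so no extra locking is needed around put/get.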
@@ -35,7 +37,8 @@ class Scrapper:
         url = self.get_url_from_ui()
         if url:
             print(f"Starting scraping at: {url}")
-            self.scrape_page(url)
+            threading.Thread(target=self.scrape_page, args=(url,), daemon=True).start()
+            threading.Thread(target=self.insert_links_to_db, daemon=True).start()
         else:
             print("No valid URL was provided.")

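Both threads are started as daemon threads, so they are killed abruptly the moment the main (UI) thread exits; if the database writer is mid-insert at that point, the row is lost. Keeping a handle and joining with a timeout on shutdown is a common mitigation (a sketch, not what this commit does):

    import threading
    import time

    def work():
        time.sleep(0.1)  # stand-in for a scrape or an INSERT

    t = threading.Thread(target=work, daemon=True)
    t.start()
    # On shutdown, give the thread a bounded chance to finish instead of killing it.
    t.join(timeout=2)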
@@ -44,6 +47,10 @@ class Scrapper:
         self.running = False
         print("Scraping stopped. Process finished.")

+        # Empty the queue to stop the insertion thread
+        while not self.link_queue.empty():
+            self.link_queue.get()
+
         # Update the "Scrapping" tab with a message
         tab = self.ui_instance.tabs["Scrapping"]
         text_widget = tab["text_widget"]
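Draining the queue here races with the writer thread, which may still be blocked in get(); the get(timeout=1) added further down is what lets it notice that running has flipped. A common alternative is a sentinel ("poison pill") that wakes the consumer and tells it to exit; a sketch outside the class:

    import threading
    from queue import Queue

    _SENTINEL = object()
    q = Queue()

    def writer():
        while True:
            item = q.get()
            if item is _SENTINEL:  # poison pill: exit cleanly
                break
            parent_url, link = item
            print(f"would insert {link} (parent: {parent_url})")

    t = threading.Thread(target=writer)
    t.start()
    q.put(("http://example.com", "http://example.com/a"))
    q.put(_SENTINEL)  # ask the writer to stop; no timeout or polling needed
    t.join()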
@@ -67,11 +74,16 @@ class Scrapper:
                 soup = BeautifulSoup(response.text, "html.parser")
                 links = [urljoin(url, a.get("href")) for a in soup.find_all("a", href=True)]
                 self.update_ui(url, links)
-                self.save_links_to_db(url, links)

                 for link in links:
-                    if self.running:
-                        threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
+                    if not self.running:
+                        break
+                    self.link_queue.put((url, link))
+
+                for link in links:
+                    if not self.running:
+                        break
+                    threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
             else:
                 print(f"Error accessing {url}: {response.status_code}")
         except Exception as e:
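This still spawns one new thread per discovered link, so the thread count grows roughly with the number of links per page raised to the crawl depth, which fits the commit message's "works, but too fast, and the app sometimes crashes". A bounded pool caps the concurrency; a sketch of the same recursive crawl with concurrent.futures (the names and the max_workers value are illustrative, not part of this commit):

    import threading
    from concurrent.futures import ThreadPoolExecutor
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    visited = set()
    lock = threading.Lock()
    pool = ThreadPoolExecutor(max_workers=8)  # hard cap instead of a thread per link

    def scrape(url):
        with lock:
            if url in visited:
                return
            visited.add(url)
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.text, "html.parser")
        for a in soup.find_all("a", href=True):
            pool.submit(scrape, urljoin(url, a["href"]))  # queued, not spawned

    pool.submit(scrape, "http://books.toscrape.com/")

Tasks beyond the eight workers wait in the executor's internal queue instead of becoming OS threads.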
@@ -89,8 +101,39 @@ class Scrapper:
         text_widget.see("end")
         text_widget.configure(state="disabled")

+
+
+    def insert_links_to_db(self):
+        """Inserts the links into the database from the queue"""
+        while self.running or not self.link_queue.empty():
+            try:
+                # Get a link from the queue
+                if not self.running and self.link_queue.empty():
+                    break
+                parent_url, link = self.link_queue.get(timeout=1)  # Wait 1 second if the queue is empty
+                connection = mysql.connector.connect(**self.db_config)
+                cursor = connection.cursor()
+                cursor.execute("CREATE TABLE IF NOT EXISTS links (id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
+                cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, parent_url))
+                connection.commit()
+                cursor.close()
+                connection.close()
+                print(f"Link saved: {link} (parent: {parent_url})")
+            except Exception as e:
+                print(f"Error saving to the database: {e}")
+
+
+    def get_url_from_ui(self):
+        """Gets the URL from the user interface"""
+        try:
+            url_entry = self.ui_instance.left_panel.url_entry
+            return url_entry.get()
+        except AttributeError:
+            print("Could not get the URL from the interface")
+            return None
+    """
     def save_links_to_db(self, url, links):
-        """Saves the links to the database"""
+        Saves the links to the database
         try:
             connection = mysql.connector.connect(**self.db_config)
             cursor = connection.cursor()
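Two details in the new insert_links_to_db are worth noting: get(timeout=1) raises queue.Empty when nothing arrives within a second, and the broad except Exception then reports that as a database error; and a fresh MySQL connection is opened and closed for every single link. A hedged reworking that reuses one connection and treats the empty queue separately (assuming the same attributes as in the diff):

    import queue
    import mysql.connector

    def insert_links_to_db(self):
        connection = mysql.connector.connect(**self.db_config)
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS links "
                       "(id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
        try:
            while self.running or not self.link_queue.empty():
                try:
                    parent_url, link = self.link_queue.get(timeout=1)
                except queue.Empty:
                    continue  # nothing to write yet; re-check the running flag
                cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)",
                               (link, parent_url))
                connection.commit()
        finally:
            cursor.close()
            connection.close()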
@@ -105,13 +148,6 @@ class Scrapper:
             connection.close()
         except Exception as e:
             print(f"Error saving to the database: {e}")
+    """

-    def get_url_from_ui(self):
-        """Gets the URL from the user interface"""
-        try:
-            url_entry = self.ui_instance.left_panel.url_entry
-            return url_entry.get()
-        except AttributeError:
-            print("Could not get the URL from the interface")
-            return None
