import threading
from queue import Queue, Empty
from urllib.parse import urljoin

import mysql.connector
import requests
from bs4 import BeautifulSoup

# Test scrape target: http://books.toscrape.com/


class Scrapper:
    def __init__(self, ui_instance):
        self.ui_instance = ui_instance
        self.visited_links = set()
        self.running = False
        self.lock = threading.Lock()
        self.link_queue = Queue()

        # Database configuration for storing the links
        self.db_config = {
            "host": "localhost",
            "user": "root",
            "password": "",
            "database": "scrap_links_db",
            "port": 3306,
        }
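
        # Note (an assumption, not stated in the original code): the
        # scrap_links_db database itself must already exist; create it
        # once in MySQL before running, e.g.:
        #   CREATE DATABASE IF NOT EXISTS scrap_links_db;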

        # Verify that the database is reachable before starting
        try:
            connection = mysql.connector.connect(**self.db_config)
            print("Database connection successful")
            connection.close()
        except Exception as e:
            print(f"Error connecting to the database: {e}")

    def start_scraping(self):
        """Starts the scraping process."""
        self.running = True
        url = self.get_url_from_ui()
        if url:
            print(f"Starting scraping at: {url}")
            # One daemon thread crawls pages; another drains the link queue into MySQL
            threading.Thread(target=self.scrape_page, args=(url,), daemon=True).start()
            threading.Thread(target=self.insert_links_to_db, daemon=True).start()
        else:
            print("No valid URL was provided.")

    def stop_scraping(self):
        """Stops the scraping process."""
        self.running = False
        print("Scraping stopped. Process finished.")

        # Empty the queue so the insertion thread can finish
        while not self.link_queue.empty():
            self.link_queue.get()

        # Update the "Scrapping" tab with a closing message
        tab = self.ui_instance.tabs["Scrapping"]
        text_widget = tab["text_widget"]

        text_widget.configure(state="normal")
        text_widget.insert("end", "Scraping finished.\n")
        text_widget.see("end")
        text_widget.configure(state="disabled")

    def scrape_page(self, url):
        """Scrapes a page and collects its links."""
        # Check and mark the URL inside the lock so two threads
        # can never claim the same page at once
        with self.lock:
            if not self.running or url in self.visited_links:
                return
            self.visited_links.add(url)

        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                links = [urljoin(url, a.get("href")) for a in soup.find_all("a", href=True)]
                self.update_ui(url, links)

                # Queue every link for database insertion
                for link in links:
                    if not self.running:
                        break
                    self.link_queue.put((url, link))

                # Recurse into each link on its own daemon thread. Note that
                # this spawns one thread per link and follows links off the
                # starting site, so it can grow quickly on large pages.
                for link in links:
                    if not self.running:
                        break
                    threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
            else:
                print(f"Error accessing {url}: {response.status_code}")
        except Exception as e:
            print(f"Error scraping {url}: {e}")
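
    # Possible refinement (an assumption, not part of the original code):
    # keep the crawl on the starting site by comparing hostnames before
    # recursing, e.g.:
    #
    #   from urllib.parse import urlparse
    #   if urlparse(link).netloc != urlparse(url).netloc:
    #       continue  # skip links that leave the starting domain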

    def update_ui(self, url, links):
        """Updates the 'Scrapping' tab with the links found."""
        # Called from worker threads; this assumes ui_instance tolerates
        # cross-thread updates (plain Tkinter widgets generally do not, so
        # a handoff such as widget.after() may be needed).
        tab = self.ui_instance.tabs["Scrapping"]
        text_widget = tab["text_widget"]

        text_widget.configure(state="normal")
        text_widget.insert("end", f"Links found at {url}:\n")
        for link in links:
            text_widget.insert("end", f"  - {link}\n")
        text_widget.see("end")
        text_widget.configure(state="disabled")

    def insert_links_to_db(self):
        """Inserts links from the queue into the database."""
        while self.running or not self.link_queue.empty():
            try:
                # Wait up to 1 second for a link; re-check the loop
                # condition whenever the queue stays empty
                parent_url, link = self.link_queue.get(timeout=1)
            except Empty:
                continue

            try:
                connection = mysql.connector.connect(**self.db_config)
                cursor = connection.cursor()
                cursor.execute(
                    "CREATE TABLE IF NOT EXISTS links "
                    "(id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)"
                )
                cursor.execute(
                    "INSERT INTO links (url, parent_url) VALUES (%s, %s)",
                    (link, parent_url),
                )
                connection.commit()
                cursor.close()
                connection.close()
                print(f"Link saved: {link} (parent: {parent_url})")
            except Exception as e:
                print(f"Error saving to the database: {e}")

    def get_url_from_ui(self):
        """Gets the URL from the user interface."""
        try:
            url_entry = self.ui_instance.left_panel.url_entry
            return url_entry.get()
        except AttributeError:
            print("Could not get the URL from the interface")
            return None
"""
|
|
def save_links_to_db(self, url, links):
|
|
Guarda los enlaces en la base de datos
|
|
try:
|
|
connection = mysql.connector.connect(**self.db_config)
|
|
cursor = connection.cursor()
|
|
cursor.execute("CREATE TABLE IF NOT EXISTS links (id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
|
|
|
|
for link in links:
|
|
print(f"Guardando enlace: {link} (parent: {url})") # Verifica los datos
|
|
cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, url))
|
|
|
|
connection.commit()
|
|
cursor.close()
|
|
connection.close()
|
|
except Exception as e:
|
|
print(f"Error al gaurdar en la base de datos: {e}")
|
|
"""