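"""
Threaded link scraper with a Tkinter GUI.

Starting from a user-supplied URL, the scraper follows every <a href> link it
finds (breadth-first), records each visited URL in a local MongoDB collection
('scraping.visited_links', deduplicated by a unique index), and streams its
progress to a scrolled text area. The crawl runs in a daemon thread so the
GUI stays responsive and can stop it through a shared flag.

Third-party dependencies: requests, beautifulsoup4, pymongo (tkinter ships
with CPython). A MongoDB server is expected at mongodb://localhost:27017/.
"""
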
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import time
import tkinter as tk
from tkinter import scrolledtext
import threading


def setup_database():
    """
    Set up the MongoDB connection and return the collection.
    """
    client = MongoClient("mongodb://localhost:27017/")  # Connect to MongoDB
    db = client["scraping"]  # Database named 'scraping'
    collection = db["visited_links"]  # Collection named 'visited_links'

    # Create a unique index to avoid duplicates
    collection.create_index("url", unique=True)

    return collection


def insert_visited_link(collection, url):
    """
    Insert a visited URL into the database.
    """
    try:
        collection.insert_one({"url": url})
    except DuplicateKeyError:
        pass  # Already stored in an earlier run; the unique index rejects it
    except Exception as e:
        print(f"Error inserting visited URL {url}: {e}")


class ScraperApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Threaded Link Scraper")
        self.root.geometry("800x500")

        # MongoDB collection for visited links
        self.collection = setup_database()

        # Flag used to stop the scraping loop
        self.running = False

        # Frame for the URL entry and buttons
        frame_top = tk.Frame(self.root)
        frame_top.pack(pady=10)

        tk.Label(frame_top, text="Enter the start URL:", font=("Arial", 12)).pack(side=tk.LEFT, padx=5)
        self.url_entry = tk.Entry(frame_top, width=50, font=("Arial", 12))
        self.url_entry.pack(side=tk.LEFT, padx=5)
        self.start_button = tk.Button(frame_top, text="Start Scraping", font=("Arial", 12), command=self.start_scraping)
        self.start_button.pack(side=tk.LEFT, padx=5)

        self.stop_button = tk.Button(frame_top, text="Stop Scraping", font=("Arial", 12), command=self.stop_scraping, state="disabled")
        self.stop_button.pack(side=tk.LEFT, padx=5)

        # Scrolled text area for the results
        self.result_area = scrolledtext.ScrolledText(self.root, wrap=tk.WORD, font=("Arial", 12), height=20, width=90)
        self.result_area.pack(pady=10)

    def start_scraping(self):
        """
        Start the scraping in a separate thread.
        """
        start_url = self.url_entry.get().strip()
        if not start_url:
            self.result_area.insert(tk.END, "Please enter a valid URL.\n")
            return

        self.result_area.insert(tk.END, f"Starting scraping from: {start_url}\n")
        self.result_area.see(tk.END)

        self.running = True
        self.start_button.config(state="disabled")
        self.stop_button.config(state="normal")

        # Start a worker thread for the scraping
        self.scraping_thread = threading.Thread(target=self.scrape_links_forever, args=(start_url,))
        self.scraping_thread.daemon = True  # Daemon thread stops when the app closes
        self.scraping_thread.start()

    def stop_scraping(self):
        """
        Stop the scraping.
        """
        self.running = False
        self.result_area.insert(tk.END, "Stopping scraping...\n")
        self.result_area.see(tk.END)
        self.start_button.config(state="normal")
        self.stop_button.config(state="disabled")

    def log(self, message):
        """
        Append a message to the result area from any thread. Tk widgets must
        only be touched by the main thread, so the update is scheduled with
        after() instead of being performed directly in the worker thread.
        """
        def append():
            self.result_area.insert(tk.END, message)
            self.result_area.see(tk.END)
        self.root.after(0, append)

    def scrape_links_forever(self, start_url):
        """
        Scrape links indefinitely and store only the visited ones in MongoDB.
        Runs in a separate thread.

        Args:
            start_url (str): Initial URL to start scraping from.
        """
        to_visit = [start_url]  # Queue of URLs still to visit
        visited = set()  # Set used to avoid loops

        while to_visit and self.running:
            current_url = to_visit.pop(0)  # Take the next URL from the queue

            if current_url in visited:
                continue

            self.log(f"Exploring: {current_url}\n")
            visited.add(current_url)  # Mark as visited

            try:
                # Perform the HTTP request
                response = requests.get(current_url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                self.log(f"Error accessing {current_url}: {e}\n")
                continue

            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find and enqueue the links
            for link in soup.find_all('a', href=True):
                full_url = urljoin(current_url, link['href'])
                if full_url not in visited and full_url not in to_visit:
                    to_visit.append(full_url)

            # Store the visited URL in the database
            insert_visited_link(self.collection, current_url)

            # Delay to avoid overloading the server
            time.sleep(1)

        # Scraping finished: report and reset the buttons on the main thread
        self.log("Scraping finished.\n")
        self.root.after(0, lambda: self.start_button.config(state="normal"))
        self.root.after(0, lambda: self.stop_button.config(state="disabled"))


if __name__ == "__main__":
    root = tk.Tk()
    app = ScraperApp(root)
    root.mainloop()
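
# A minimal follow-up sketch (assuming the same local MongoDB instance that
# setup_database() connects to): after a crawl, the stored URLs can be listed
# with, e.g.:
#
#     from pymongo import MongoClient
#     links = MongoClient("mongodb://localhost:27017/")["scraping"]["visited_links"]
#     for doc in links.find():
#         print(doc["url"])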