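"""
Threaded link scraper with a Tkinter GUI.

Starting from a user-supplied URL, the scraper follows every <a href> link it
finds (breadth-first), records each visited URL in a local MongoDB collection
('scraping.visited_links', deduplicated by a unique index), and streams its
progress to a scrolled text area. The crawl runs in a daemon thread so the
GUI stays responsive and can stop it through a shared flag.

Third-party dependencies: requests, beautifulsoup4, pymongo (tkinter ships
with CPython). A MongoDB server is expected at mongodb://localhost:27017/.
"""
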
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import time
import tkinter as tk
from tkinter import scrolledtext
import threading


def setup_database():
    """
    Set up the MongoDB connection and return the collection.
    """
    client = MongoClient("mongodb://localhost:27017/")  # Connect to MongoDB
    db = client["scraping"]  # Database named 'scraping'
    collection = db["visited_links"]  # Collection named 'visited_links'

    # Create a unique index to avoid duplicates
    collection.create_index("url", unique=True)

    return collection


def insert_visited_link(collection, url):
    """
    Insert a visited URL into the database.
    """
    try:
        collection.insert_one({"url": url})
    except DuplicateKeyError:
        pass  # Already stored in an earlier run; the unique index rejects it
    except Exception as e:
        print(f"Error inserting visited URL {url}: {e}")


class ScraperApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Threaded Link Scraper")
        self.root.geometry("800x500")

        # MongoDB collection for visited links
        self.collection = setup_database()

        # Flag used to stop the scraping loop
        self.running = False

        # Frame for the URL entry and buttons
        frame_top = tk.Frame(self.root)
        frame_top.pack(pady=10)

        tk.Label(frame_top, text="Enter the start URL:", font=("Arial", 12)).pack(side=tk.LEFT, padx=5)
        self.url_entry = tk.Entry(frame_top, width=50, font=("Arial", 12))
        self.url_entry.pack(side=tk.LEFT, padx=5)
        self.start_button = tk.Button(frame_top, text="Start Scraping", font=("Arial", 12), command=self.start_scraping)
        self.start_button.pack(side=tk.LEFT, padx=5)

        self.stop_button = tk.Button(frame_top, text="Stop Scraping", font=("Arial", 12), command=self.stop_scraping, state="disabled")
        self.stop_button.pack(side=tk.LEFT, padx=5)

        # Scrolled text area for the results
        self.result_area = scrolledtext.ScrolledText(self.root, wrap=tk.WORD, font=("Arial", 12), height=20, width=90)
        self.result_area.pack(pady=10)

    def start_scraping(self):
        """
        Start the scraping in a separate thread.
        """
        start_url = self.url_entry.get().strip()
        if not start_url:
            self.result_area.insert(tk.END, "Please enter a valid URL.\n")
            return

        self.result_area.insert(tk.END, f"Starting scraping from: {start_url}\n")
        self.result_area.see(tk.END)

        self.running = True
        self.start_button.config(state="disabled")
        self.stop_button.config(state="normal")

        # Start a worker thread for the scraping
        self.scraping_thread = threading.Thread(target=self.scrape_links_forever, args=(start_url,))
        self.scraping_thread.daemon = True  # Daemon thread stops when the app closes
        self.scraping_thread.start()

    def stop_scraping(self):
        """
        Stop the scraping.
        """
        self.running = False
        self.result_area.insert(tk.END, "Stopping scraping...\n")
        self.result_area.see(tk.END)
        self.start_button.config(state="normal")
        self.stop_button.config(state="disabled")

    def log(self, message):
        """
        Append a message to the result area from any thread. Tk widgets must
        only be touched by the main thread, so the update is scheduled with
        after() instead of being performed directly in the worker thread.
        """
        def append():
            self.result_area.insert(tk.END, message)
            self.result_area.see(tk.END)
        self.root.after(0, append)

    def scrape_links_forever(self, start_url):
        """
        Scrape links indefinitely and store only the visited ones in MongoDB.
        Runs in a separate thread.

        Args:
            start_url (str): Initial URL to start scraping from.
        """
        to_visit = [start_url]  # Queue of URLs still to visit
        visited = set()  # Set used to avoid loops

        while to_visit and self.running:
            current_url = to_visit.pop(0)  # Take the next URL from the queue

            if current_url in visited:
                continue

            self.log(f"Exploring: {current_url}\n")
            visited.add(current_url)  # Mark as visited

            try:
                # Perform the HTTP request
                response = requests.get(current_url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                self.log(f"Error accessing {current_url}: {e}\n")
                continue

            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find and enqueue the links
            for link in soup.find_all('a', href=True):
                full_url = urljoin(current_url, link['href'])
                if full_url not in visited and full_url not in to_visit:
                    to_visit.append(full_url)

            # Store the visited URL in the database
            insert_visited_link(self.collection, current_url)

            # Delay to avoid overloading the server
            time.sleep(1)

        # Scraping finished: report and reset the buttons on the main thread
        self.log("Scraping finished.\n")
        self.root.after(0, lambda: self.start_button.config(state="normal"))
        self.root.after(0, lambda: self.stop_button.config(state="disabled"))


if __name__ == "__main__":
    root = tk.Tk()
    app = ScraperApp(root)
    root.mainloop()
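
# A minimal follow-up sketch (assuming the same local MongoDB instance that
# setup_database() connects to): after a crawl, the stored URLs can be listed
# with, e.g.:
#
#     from pymongo import MongoClient
#     links = MongoClient("mongodb://localhost:27017/")["scraping"]["visited_links"]
#     for doc in links.find():
#         print(doc["url"])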