import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pymongo import MongoClient


class ScraperModel:
    def __init__(self):
        self.to_visit = []
        self.visited = set()
        # Connect to MongoDB
        self.client = MongoClient("mongodb://localhost:27017/")
        self.db = self.client["scraping"]
        self.collection = self.db["visited_links"]
        # Create a unique index to avoid duplicate entries
        self.collection.create_index("url", unique=True)

    def add_url(self, url):
        """Add a URL to the pending list."""
        if url not in self.visited and url not in self.to_visit:
            self.to_visit.append(url)

    def scrape_next_url(self):
        """Scrape the next pending URL.

        Returns (url, found_links) on success, (None, []) when the queue
        is empty, and (url, error_message) when the request fails.
        """
        if not self.to_visit:
            return None, []
        current_url = self.to_visit.pop(0)
        self.visited.add(current_url)
        try:
            # Request the URL
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Note: on failure the second element is an error string,
            # not a list of links; callers must check for this.
            return current_url, f"Error accessing {current_url}: {e}"
        # Process the links found on the page
        soup = BeautifulSoup(response.text, 'html.parser')
        found_links = []
        for link in soup.find_all('a', href=True):
            # Resolve relative links against the current page's URL
            full_url = urljoin(current_url, link['href'])
            if full_url not in self.visited and full_url not in self.to_visit:
                self.to_visit.append(full_url)
                found_links.append(full_url)
        # Record the visited URL in MongoDB
        self.save_to_database(current_url)
        return current_url, found_links

    def save_to_database(self, url):
        """Save the visited URL to the database."""
        try:
            self.collection.insert_one({"url": url})
        except Exception as e:
            # The unique index raises on duplicates; log and continue.
            print(f"Error saving to the database: {e}")

    def has_pending_urls(self):
        """Check whether there are pending URLs."""
        return bool(self.to_visit)
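
# A minimal usage sketch, assuming a MongoDB instance is reachable at
# localhost:27017 and using https://example.com as a hypothetical seed URL.
# The page cap is an illustrative choice so the demo crawl terminates.
if __name__ == "__main__":
    scraper = ScraperModel()
    scraper.add_url("https://example.com")
    max_pages = 5
    while scraper.has_pending_urls() and len(scraper.visited) < max_pages:
        url, result = scraper.scrape_next_url()
        if isinstance(result, str):
            # On a request failure, scrape_next_url returns the error
            # message string in place of the list of links.
            print(result)
        else:
            print(f"Scraped {url}: found {len(result)} new links")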