ProyectFinalServices/models/Scraper.py

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError


class ScraperModel:
    def __init__(self):
        self.to_visit = []
        self.visited = set()
        # Connect to MongoDB
        self.client = MongoClient("mongodb://localhost:27017/")
        self.db = self.client["scraping"]
        self.collection = self.db["visited_links"]
        # Create a unique index to avoid duplicate entries
        self.collection.create_index("url", unique=True)

    def add_url(self, url):
        """Add a URL to the pending queue if it has not been seen yet."""
        if url not in self.visited and url not in self.to_visit:
            self.to_visit.append(url)

    def scrape_next_url(self):
        """Scrape the next pending URL.

        Returns a (url, found_links) tuple; on a request failure the
        second element is an error message string instead of a list.
        """
        if not self.to_visit:
            return None, []
        current_url = self.to_visit.pop(0)
        self.visited.add(current_url)
        try:
            # Fetch the page
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            return current_url, f"Error accessing {current_url}: {e}"
        # Collect the links found on the page, queueing only unseen ones
        soup = BeautifulSoup(response.text, 'html.parser')
        found_links = []
        for link in soup.find_all('a', href=True):
            full_url = urljoin(current_url, link['href'])
            if full_url not in self.visited and full_url not in self.to_visit:
                self.to_visit.append(full_url)
                found_links.append(full_url)
        # Record the visited URL in MongoDB
        self.save_to_database(current_url)
        return current_url, found_links

    def save_to_database(self, url):
        """Store the visited URL in the database."""
        try:
            self.collection.insert_one({"url": url})
        except DuplicateKeyError:
            # The unique index rejects URLs already stored in a previous run
            pass
        except Exception as e:
            print(f"Error saving to the database: {e}")

    def has_pending_urls(self):
        """Check whether there are URLs left to visit."""
        return bool(self.to_visit)
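

# A minimal usage sketch, not part of the original file: it seeds the queue
# with a hypothetical start URL and drains it for a few iterations. It assumes
# a MongoDB instance is reachable at localhost:27017.
if __name__ == "__main__":
    scraper = ScraperModel()
    scraper.add_url("https://example.com")  # hypothetical seed URL
    for _ in range(5):  # cap the crawl at a handful of pages
        if not scraper.has_pending_urls():
            break
        url, result = scraper.scrape_next_url()
        if isinstance(result, str):
            print(result)  # the request failed; result is the error message
        else:
            print(f"{url}: found {len(result)} new links")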