import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError


class ScraperModel:
    """Model that tracks pending and visited URLs and persists visited ones in MongoDB."""

    def __init__(self):
        self.to_visit = []
        self.visited = set()

        # Connect to MongoDB
        self.client = MongoClient("mongodb://localhost:27017/")
        self.db = self.client["scraping"]
        self.collection = self.db["visited_links"]

        # Create a unique index to avoid duplicate entries
        self.collection.create_index("url", unique=True)

    def add_url(self, url):
        """Add a URL to the pending list."""
        if url not in self.visited and url not in self.to_visit:
            self.to_visit.append(url)

    def scrape_next_url(self):
        """Scrape the next pending URL."""
        if not self.to_visit:
            return None, []

        current_url = self.to_visit.pop(0)
        self.visited.add(current_url)

        try:
            # Request the URL
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            return current_url, f"Error accessing {current_url}: {e}"

        # Process the links found on the page
        soup = BeautifulSoup(response.text, 'html.parser')
        found_links = []

        for link in soup.find_all('a', href=True):
            full_url = urljoin(current_url, link['href'])
            if full_url not in self.visited and full_url not in self.to_visit:
                self.to_visit.append(full_url)
                found_links.append(full_url)

        # Save the visited URL in MongoDB
        self.save_to_database(current_url)
        return current_url, found_links

    def save_to_database(self, url):
        """Save the visited URL in the database."""
        try:
            self.collection.insert_one({"url": url})
        except DuplicateKeyError:
            # The unique index rejects URLs that were already stored; ignore them.
            pass
        except Exception as e:
            print(f"Error saving to the database: {e}")

    def has_pending_urls(self):
        """Check whether there are pending URLs."""
        return bool(self.to_visit)
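

# --- Usage sketch (illustrative, not part of the original model) ---
# A minimal example of how ScraperModel could be driven, assuming a MongoDB
# instance is reachable at localhost:27017; the seed URL below is a placeholder.
if __name__ == "__main__":
    scraper = ScraperModel()
    scraper.add_url("https://example.com")

    # Crawl a handful of pages so the example terminates quickly.
    for _ in range(5):
        if not scraper.has_pending_urls():
            break
        url, result = scraper.scrape_next_url()
        if isinstance(result, str):
            # scrape_next_url returns an error message string on request failures.
            print(result)
        else:
            print(f"Scraped {url}, found {len(result)} new links")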