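"""Threaded web scraper.

Pipeline:

    Thread A (scraping_thread)         - downloads a page and extracts its text and links
    Thread B (link_processing_thread)  - forwards every link found by thread A
    Thread C (save_to_file_thread)     - appends the page text to scraping_output.txt
    Thread D (save_to_database_thread) - stores each link in the MySQL table `enlaces`

The stages communicate through queue.Queue objects and are shut down with
None sentinels once all queues have been drained.

Third-party dependencies: requests, beautifulsoup4, mysql-connector-python.
"""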
import queue
import threading
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import mysql.connector

# Queues used for communication between the threads
scraping_queue = queue.Queue()       # URLs for thread A to scrape
link_queue = queue.Queue()           # lists of links found on each page (thread A -> B)
scraping_data_queue = queue.Queue()  # (url, page_text) pairs to write to disk (thread A -> C)
link_database_queue = queue.Queue()  # individual links to insert into MySQL (thread B -> D)


# Thread A: scrapes a web page
def scraping_thread():
    while True:
        url = scraping_queue.get()
        if url is None:
            break

        print(f"[Thread A] Scraping {url}")

        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                page_text = soup.get_text()
                # Resolve relative hrefs against the page URL
                links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

                # Hand the page text to thread C and the links to thread B
                scraping_data_queue.put((url, page_text))
                link_queue.put(links)
                thread_b = threading.Thread(target=link_processing_thread, daemon=True)
                thread_b.start()
            else:
                print(f"[Thread A] Error accessing {url}: {response.status_code}")
        except Exception as e:
            print(f"[Thread A] Exception during scraping: {e}")
        finally:
            scraping_queue.task_done()


# Thread B: processes the links found by thread A
def link_processing_thread():
    while True:
        links = link_queue.get()
        if links is None:
            break
        print(f"[Thread B] Processing {len(links)} links")

        # Hand every link to thread D so it can be stored in the database
        for link in links:
            print(f"[Thread B] Adding link to link_database_queue: {link}")
            link_database_queue.put(link)

        link_queue.task_done()
        thread_c = threading.Thread(target=save_to_file_thread, daemon=True)
        thread_c.start()


# Thread C: saves the scraped text to a file
def save_to_file_thread():
    while True:
        item = scraping_data_queue.get()
        if item is None:
            break
        url, page_text = item

        print(f"[Thread C] Saving data from {url}")
        with open("scraping_output.txt", "a", encoding='utf-8') as file:
            file.write(f"\n\nURL: {url}\n\n{page_text}\n")
        scraping_data_queue.task_done()

        thread_d = threading.Thread(target=save_to_database_thread, daemon=True)
        thread_d.start()


# Thread D: stores the links found by thread B in the MySQL database
def save_to_database_thread():
    conexion = None
    try:
        # Connect to the database
        conexion = mysql.connector.connect(
            host="localhost",
            user="thread4",
            password="1234",
            database="thread4"
        )
        cursor = conexion.cursor()

        while True:
            link = link_database_queue.get()
            if link is None:
                break

            # Insert the link, skipping duplicates
            consulta = "INSERT IGNORE INTO enlaces (enlace) VALUES (%s)"
            cursor.execute(consulta, (link,))

            # Commit the transaction
            conexion.commit()
            print(f"[Thread D] Link stored successfully: {link}")

            link_database_queue.task_done()

    except mysql.connector.Error as err:
        print(f"[Thread D] Error: {err}")

    finally:
        if conexion is not None and conexion.is_connected():
            cursor.close()
            conexion.close()


# create_database()
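# NOTE: create_database() is referenced above but its definition is not part of
# this script. Below is a hypothetical sketch of such a helper, assuming the
# `thread4` database already exists and that `enlaces.enlace` needs a UNIQUE
# index so that INSERT IGNORE can skip duplicate links.
def create_database():
    # Hypothetical helper: creates the `enlaces` table used by thread D.
    conexion = mysql.connector.connect(
        host="localhost",
        user="thread4",
        password="1234",
        database="thread4"
    )
    cursor = conexion.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS enlaces ("
        " id INT AUTO_INCREMENT PRIMARY KEY,"
        " enlace VARCHAR(255) NOT NULL UNIQUE"
        ")"
    )
    conexion.commit()
    cursor.close()
    conexion.close()
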
# Initial URL to start the scraping from
initial_url = "http://localhost:8081/thread04/"
scraping_queue.put(initial_url)

# Start thread A (the remaining threads are started as work arrives)
thread_a = threading.Thread(target=scraping_thread, daemon=True)
thread_a.start()

# Wait until every queue has been fully processed
scraping_queue.join()
link_queue.join()
scraping_data_queue.join()
link_database_queue.join()

# Stop the threads once the work is finished
scraping_queue.put(None)
link_queue.put(None)
scraping_data_queue.put(None)
link_database_queue.put(None)

thread_a.join()