From ab067470698f5e31e86fd601dfb94422a050b05b Mon Sep 17 00:00:00 2001
From: Pau
Date: Thu, 31 Oct 2024 20:42:07 +0100
Subject: [PATCH] everything corrected except connection to database

---
 threads04/scraping.py | 109 +++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 60 deletions(-)

diff --git a/threads04/scraping.py b/threads04/scraping.py
index d45c222..5322057 100644
--- a/threads04/scraping.py
+++ b/threads04/scraping.py
@@ -8,13 +8,6 @@ import queue
 scraping_queue = queue.Queue()
 link_queue = queue.Queue()
 
-# Connection to the MySQL database
-db_conn = mysql.connector.connect(
-    host="localhost",      # Change to match your MySQL setup
-    user="thread04",       # Your MySQL user
-    password="1234",       # Your MySQL password
-    database="thread04"    # Database name
-)
 
 # Thread A: scrapes a web page
 def scraping_thread():
@@ -22,6 +15,7 @@ def scraping_thread():
         url = scraping_queue.get()
         if url is None:
             break
+
         print(f"[Thread A] Scraping {url}")
 
         try:
@@ -29,11 +23,13 @@ def scraping_thread():
             if response.status_code == 200:
                 soup = BeautifulSoup(response.content, 'html.parser')
                 page_text = soup.get_text()
-                links = [a['href'] for a in soup.find_all('a', href=True)]
+                links = [initial_url + a['href'] for a in soup.find_all('a', href=True)]
 
                 # Pass the text to thread C and the links to thread B
                 scraping_data_queue.put((url, page_text))
                 link_queue.put(links)
+                thread_b = threading.Thread(target=link_processing_thread, daemon=True)
+                thread_b.start()
             else:
                 print(f"[Thread A] Error accessing {url}: {response.status_code}")
         except Exception as e:
@@ -51,9 +47,12 @@ def link_processing_thread():
 
         # Save the links to the database (thread D)
         for link in links:
+            print(f"[Thread B] Adding link to link_database_queue: {link}")
             link_database_queue.put(link)
 
         link_queue.task_done()
+        thread_c = threading.Thread(target=save_to_file_thread, daemon=True)
+        thread_c.start()
 
 # Thread C: saves the scraped text to a file
 def save_to_file_thread():
@@ -66,74 +65,64 @@ def save_to_file_thread():
             file.write(f"\n\nURL: {url}\n\n{page_text}\n")
 
         scraping_data_queue.task_done()
+        thread_d = threading.Thread(target=save_to_database_thread)
+        thread_d.start()
+
 # Thread D: saves the links to the MySQL database and feeds them back to thread A
 def save_to_database_thread():
-    while True:
-        link = link_database_queue.get()
-        if link is None:
-            break
-        print(f"[Thread D] Saving link to the database: {link}")
-        cursor = db_conn.cursor()
+    try:
+        db_conn = mysql.connector.connect(
+            host="localhost",
+            user="thread04",
+            password="1234",
+            database="thread04",
+            port=3307
+        )
+        while True:
+            link = link_database_queue.get()
+            print(link)
+            if link is None:
+                break
+            print(f"[Thread D] Saving link to the database: {link}")
+            cursor = db_conn.cursor()
+
-        try:
             cursor.execute("INSERT INTO enlaces (url) VALUES (%s)", (link,))
             db_conn.commit()
-
+
             # Put the link back on the scraping queue so thread A processes it
             scraping_queue.put(link)
-        except mysql.connector.Error as err:
-            print(f"[Thread D] Database error: {err}")
-        finally:
-            cursor.close()
+    except Exception as err:
+        print(f"[Thread D] Database error: {err}")
+    finally:
+        cursor.close()
 
         link_database_queue.task_done()
 
 # Initialize the queues used for communication
 scraping_data_queue = queue.Queue()
 link_database_queue = queue.Queue()
 
-# Create the tables if they do not exist
-def create_database():
-    cursor = db_conn.cursor()
-    cursor.execute("""
-        CREATE TABLE IF NOT EXISTS enlaces (
-            id INT AUTO_INCREMENT PRIMARY KEY,
-            url VARCHAR(255) NOT NULL
-        )
-    """)
-    db_conn.commit()
-    cursor.close()
+#create_database()
 
-# Start the program
-def main():
-    create_database()
-    # Initial URLs to start the scraping
-    initial_url = "http://localhost:8081/thread04/index.html"
+# Initial URLs to start the scraping
+initial_url = "http://localhost:8081/thread04/"
+scraping_queue.put(initial_url)
 
-    scraping_queue.put(initial_url)
-
-    # Start the threads
-    thread_a = threading.Thread(target=scraping_thread, daemon=True)
-    thread_b = threading.Thread(target=link_processing_thread, daemon=True)
-    thread_c = threading.Thread(target=save_to_file_thread, daemon=True)
-    thread_d = threading.Thread(target=save_to_database_thread, daemon=True)
-
-    thread_a.start()
-    thread_b.start()
-    thread_c.start()
-    thread_d.start()
+# Start the threads
+thread_a = threading.Thread(target=scraping_thread, daemon=True)
+thread_a.start()
 
-    # Wait for the queues to drain
-    scraping_queue.join()
-    link_queue.join()
-    scraping_data_queue.join()
-    link_database_queue.join()
+# Wait for the queues to drain
+scraping_queue.join()
+link_queue.join()
+scraping_data_queue.join()
+link_database_queue.join()
+# Stop the threads once the tasks are finished
+scraping_queue.put(None)
+link_queue.put(None)
+scraping_data_queue.put(None)
+link_database_queue.put(None)
 
-    # Stop the threads once the tasks are finished
-    scraping_queue.put(None)
-    link_queue.put(None)
-    scraping_data_queue.put(None)
-    link_database_queue.put(None)
+thread_a.join()
 
-if __name__ == "__main__":
-    main()
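The subject line notes that the database connection is still not working. As a minimal standalone sketch (not part of the patch), the snippet below only verifies connectivity with the same credentials, database name, and port 3307 that thread D uses; if your MySQL server listens on the default port 3306, drop or change the port argument.

# Standalone connectivity check; host/user/password/database/port mirror the values in the patch.
import mysql.connector

try:
    conn = mysql.connector.connect(
        host="localhost",
        user="thread04",
        password="1234",
        database="thread04",
        port=3307,
        connection_timeout=5,
    )
    cursor = conn.cursor()
    cursor.execute("SELECT 1")
    print("Connection OK:", cursor.fetchone())
    cursor.close()
    conn.close()
except mysql.connector.Error as err:
    print("Connection failed:", err)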