Updated project: reads files and extracts links, but still has problems with threads

2024-10-31 12:16:05 +01:00 · 2024-10-31 12:16:05 +01:00 · 220045b92b
parent 902f347771
commit 220045b92b
9 changed files with 158 additions and 1 deletions
--- a/Ejercicio04_WebScrapper/Main.py
+++ b/Ejercicio04_WebScrapper/Main.py
@ -0,0 +1,69 @@
+import os
+import threading
+import queue
+import time
+from WebFileReader import WebFileReader
+
+print("Directorio actual:", os.getcwd())
+
+# Output locations, relative to the working directory printed above.
+OUTPUT_DIR = "EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output"
+LINKS_FILE = f"{OUTPUT_DIR}/extracted_links.txt"
+
+# Unique marker put on a queue to tell its consumer that no more items will come.
+_DONE = object()
+
+# Queues used for communication between threads.
+# Every consumer owns its queue: if threads B and C shared a single queue,
+# each file would be delivered to only ONE of them and the other would miss it.
+data_queue = queue.Queue()     # Thread A -> Thread B (link extraction)
+content_queue = queue.Queue()  # Thread A -> Thread C (content saving)
+link_queue = queue.Queue()     # Thread B -> Thread D (link saving)
+
+# Shared WebFileReader instance
+reader = WebFileReader()
+
+
+# Thread A: reads the HTML files and hands them to threads B and C
+def hilo_a():
+    """Read each input file and publish (filename, content) to both consumers.
+
+    Always finishes by sending the end-of-stream marker to both queues, even
+    if reading fails, so the consumer threads can terminate.
+    """
+    filenames = ["index.html", "1.html", "2.html"]
+    try:
+        for filename in filenames:
+            print(f"[Hilo A] Leyendo archivo: {filename}")
+            content = reader.read_file(filename)
+            # read_file returns None on failure; an empty file is still valid.
+            if content is not None:
+                item = (filename, content)
+                data_queue.put(item)
+                content_queue.put(item)
+            time.sleep(1)  # Simulated wait time
+    finally:
+        data_queue.put(_DONE)
+        content_queue.put(_DONE)
+
+
+# Thread B: extracts the links from the HTML content and puts them on link_queue
+def hilo_b():
+    """Consume data_queue until the end marker and forward every link found.
+
+    Forwards the end marker to link_queue on exit so thread D stops as well.
+    """
+    try:
+        while True:
+            # Blocking get(): no busy-wait and no empty()/get() race.
+            item = data_queue.get()
+            if item is _DONE:
+                break
+            filename, html_content = item
+            print(f"[Hilo B] Extrayendo enlaces de: {filename}")
+            for link in reader.extract_links(html_content):
+                link_queue.put(link)
+    finally:
+        link_queue.put(_DONE)
+
+
+# Thread C: saves the content of each file as a text file in the output folder
+def hilo_c():
+    """Consume content_queue until the end marker, writing one file per item."""
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    while True:
+        item = content_queue.get()
+        if item is _DONE:
+            break
+        filename, html_content = item
+        save_path = f"{OUTPUT_DIR}/{filename.replace('.html', '')}_content.txt"
+        with open(save_path, 'w', encoding='utf-8') as file:
+            file.write(html_content)
+            print(f"[Hilo C] Guardando contenido de {filename} en {save_path}")
+
+
+# Thread D: appends the extracted links to a text file for later analysis
+def hilo_d():
+    """Consume link_queue until the end marker, appending one link per line."""
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    with open(LINKS_FILE, 'a', encoding='utf-8') as file:
+        while True:
+            link = link_queue.get()
+            if link is _DONE:
+                break
+            file.write(link + "\n")
+            file.flush()  # keep the file up to date while the thread is running
+            print(f"[Hilo D] Enlace guardado: {link}")
+
+
+# Daemon threads so that Ctrl+C ends the program even if a worker is blocked.
+thread_a = threading.Thread(target=hilo_a, daemon=True)
+thread_b = threading.Thread(target=hilo_b, daemon=True)
+thread_c = threading.Thread(target=hilo_c, daemon=True)
+thread_d = threading.Thread(target=hilo_d, daemon=True)
+
+thread_a.start()
+thread_b.start()
+thread_c.start()
+thread_d.start()
+
+try:
+    for worker in (thread_a, thread_b, thread_c, thread_d):
+        # join() with a timeout keeps the main thread responsive to Ctrl+C.
+        while worker.is_alive():
+            worker.join(timeout=0.5)
+except KeyboardInterrupt:
+    # Interrupted by the user: the daemon threads end with the process.
+    pass
+finally:
+    print("Programa terminado")
--- a/Ejercicio04_WebScrapper/WebFileReader.py
+++ b/Ejercicio04_WebScrapper/WebFileReader.py
@ -1 +1,31 @@
-import mysql.connector
+import os
+from bs4 import BeautifulSoup
+
+class WebFileReader:
+    """Read local HTML files and extract the links they contain."""
+
+    def __init__(self, base_path="EjerciciosConHilos/Ejercicio04_WebScrapper/resources/input_html"):
+        """Store the directory that file names are resolved against.
+
+        Args:
+            base_path: Directory joined with each file name in read_file.
+                The default is a relative path, so it is resolved against
+                the current working directory.
+        """
+        self.base_path = base_path
+
+    def read_file(self, filename):
+        """Read an HTML file and return its content as text.
+
+        Args:
+            filename: File name, joined onto ``base_path``.
+
+        Returns:
+            The file content decoded as UTF-8, or None if the file does not
+            exist or cannot be read/decoded (a message is printed instead of
+            raising).
+        """
+        filepath = os.path.join(self.base_path, filename)
+        try:
+            with open(filepath, 'r', encoding='utf-8') as file:
+                return file.read()
+        except FileNotFoundError:
+            print(f"[WebFileReader] Archivo no encontrado: {filepath}")
+            return None
+        except (OSError, ValueError) as e:
+            # OSError: permission/IO problems. ValueError: covers
+            # UnicodeDecodeError and invalid paths (e.g. embedded NUL).
+            print(f"[WebFileReader] Error al leer el archivo {filepath}: {str(e)}")
+            return None
+
+    def extract_links(self, html_content):
+        """Extract every link target from the given HTML content.
+
+        Args:
+            html_content: HTML text; may be None or empty (e.g. the result
+                of a failed read_file call).
+
+        Returns:
+            A list with the ``href`` value of each ``<a>`` tag that has one,
+            in document order; an empty list when there is no content.
+        """
+        # Nothing to parse: avoid handing None to the parser.
+        if not html_content:
+            return []
+        soup = BeautifulSoup(html_content, 'html.parser')
+        return [a['href'] for a in soup.find_all('a', href=True)]
+
+
--- a/Ejercicio04_WebScrapper/pycache/WebFileReader.cpython-312.pyc
+++ b/Ejercicio04_WebScrapper/pycache/WebFileReader.cpython-312.pyc
--- a/Ejercicio04_WebScrapper/resources/input_html/1.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/1.html
@ -0,0 +1,11 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Página 1</title>
+    </head>
+    <BODY>
+        <a href="2.html">Ir a página 2</a>
+        <br>
+        <a href="index.html">Volver</a>
+    </BODY>
+</html>
--- a/Ejercicio04_WebScrapper/resources/input_html/2.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/2.html
@ -0,0 +1,11 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Página 2</title>
+    </head>
+    <BODY>
+        <a href="1.html">Ir a página 1</a>
+        <br>
+        <a href="index.html">Volver</a>
+    </BODY>
+</html>
--- a/Ejercicio04_WebScrapper/resources/input_html/index.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/index.html
@ -0,0 +1,9 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Inicio</title>
+    </head>
+    <BODY>
+        <a href="1.html">Ir a página 1</a>
+    </BODY>
+</html>
--- a/Ejercicio04_WebScrapper/resources/output/1_content.txt
+++ b/Ejercicio04_WebScrapper/resources/output/1_content.txt
@ -0,0 +1,11 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Página 1</title>
+    </head>
+    <BODY>
+        <a href="2.html">Ir a página 2</a>
+        <br>
+        <a href="index.html">Volver</a>
+    </BODY>
+</html>
--- a/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
+++ b/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
@ -0,0 +1,7 @@
+2.html
+index.html
+1.html
+index.html
+1.html
+1.html
+index.html
--- a/Ejercicio04_WebScrapper/resources/output/index_content.txt
+++ b/Ejercicio04_WebScrapper/resources/output/index_content.txt
@ -0,0 +1,9 @@
+<html lang="es">
+    <head>
+        <meta charset="UTF-8">
+        <title>Inicio</title>
+    </head>
+    <BODY>
+        <a href="1.html">Ir a página 1</a>
+    </BODY>
+</html>