Extracts links

2024-11-14 14:24:52 +01:00 · 2024-11-14 14:24:52 +01:00 · 7318501edb
parent 220045b92b
commit 7318501edb
10 changed files with 167 additions and 81 deletions
--- a/Ejercicio04_WebScrapper/Main.py
+++ b/Ejercicio04_WebScrapper/Main.py
@ -1,69 +1,97 @@
 import os
 import threading
 import queue
-import time
 from WebFileReader import WebFileReader

-print("Directorio actual:", os.getcwd())
+# Input and output locations; both are relative paths, so they resolve against the current working directory
+base_path = "EjerciciosConHilos/Ejercicio04_WebScrapper/resources/input_html"
+output_dir = "EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output"
+output_file = os.path.join(output_dir, "extracted_links.txt")

-# Inicializa colas para la comunciación entre hilos
-data_queue = queue.Queue()
+# Make sure the output directory exists before anything is written to it
+os.makedirs(output_dir, exist_ok=True)
+
+# Queue of links waiting to be processed, plus the set of links already claimed by a worker
 link_queue = queue.Queue()
+processed_links = set()
+lock = threading.Lock()  # Guards every access to `processed_links`

-# Instancia de WebFileReader
-reader = WebFileReader()
+# WebFileReader instance used to read the HTML files and extract their links
+reader = WebFileReader(base_path=base_path)

-# Hilo A: Lee el archivos HTML y los coloca en data_queue
-def hilo_a():
-    filenames = ["index.html", "1.html", "2.html"]
-    for filename in filenames:
-        print(f"[Hilo A] Leyendo archivo: {filename}")
-        content = reader.read_file(filename)
-        if content:
-            data_queue.put((filename, content))
-        time.sleep(1) #Simulacion del tiempo de espera
+# Module-level helper (defined in Main.py, not a WebFileReader method) that saves the extracted links
+def guardar_enlaces(enlaces):
+    """
+    Append every link in `enlaces` to `output_file`, one link per line.
+    """
+    with open(output_file, 'a', encoding='utf-8') as file:  # 'a' appends, so previously saved links are kept
+        for enlace in enlaces:
+            file.write(enlace + "\n")
+    print(f"[WebFileReader] Enlaces guardados en {output_file}")

-# Hilo B: Extrae los enlaces del contenido HTML y los coloca en link_queue
-def hilo_b():
+# Worker loop: takes links from link_queue, reads each page once, saves its links and enqueues them
+def procesar_enlace():
    while True:
-        if not data_queue.empty():
-            filename, html_content = data_queue.get()
-            print(f"[Hilo B] Extrayendo enlaces de: {filename}")
-            links = reader.extract_links(html_content)
-            for link in links:
-                link_queue.put(link)
-
-# Hilo C: Guarda el contenido de los archivos en archivos de texto en la carpeta resources
-def hilo_c():
-    while True:
-        if not data_queue.empty():
-            filename, html_content = data_queue.get()
-            save_path = f"EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output/{filename.replace('.html', '')}_content.txt"
-            with open(save_path, 'w', encoding='utf-8') as file:
-                file.write(html_content)
-                print(f"[Hilo C] Guardando contenido de {filename} en {save_path}")
-
-# Hilo D: Guarda los enlaces extraídos en un archivo de texto para análisis
-def hilo_d():
-    while True:
-        if not link_queue.empty():
+        # Block until a link (or the None stop sentinel) is available
        link = link_queue.get()
-            with open("EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output/extracted_links.txt", 'a', encoding='utf-8') as file:
-                file.write(link + "\n")
-            print(f"[Hilo D] Enlace guardado: {link}")
        
-thread_a = threading.Thread(target=hilo_a, daemon=True)
-thread_b = threading.Thread(target=hilo_b, daemon=True)
-thread_c = threading.Thread(target=hilo_c, daemon=True)
-thread_d = threading.Thread(target=hilo_d, daemon=True)
+        # None is the stop sentinel the main script queues once all work is done
+        if link is None:
+            break

-thread_a.start()
-thread_b.start()
-thread_c.start()
-thread_d.start()
+        # Claim the link under the lock; a link that was already claimed is acknowledged and skipped
+        with lock:
+            if link in processed_links:
+                link_queue.task_done()
+                continue
+            processed_links.add(link)

-try:
-    while True:
-        time.sleep(1)
-except KeyboardInterrupt:
-    print("Programa terminado")
+        print(f"[Hilo] Procesando enlace: {link}")
+
+        # NOTE(review): full_path is assigned but never used below; read_file receives the bare link instead
+        full_path = os.path.join(base_path, link)
+        
+        # Read the HTML file; a falsy result skips link extraction for this link
+        content = reader.read_file(link)
+        if content:
+            # Extract the links found in this page
+            new_links = reader.extract_links(content)
+            
+            # Append all extracted links to the output file (done before any de-duplication)
+            guardar_enlaces(new_links)
+            
+            # Enqueue the discovered links that no worker has claimed yet
+            for new_link in new_links:
+                # Normalize the path text; NOTE(review): a link can still be queued several times before it is claimed - the claim check above drops the repeats
+                normalized_link = os.path.normpath(new_link)
+                
+                with lock:
+                    if normalized_link not in processed_links:
+                        link_queue.put(normalized_link)
+
+        # Acknowledge the item; NOTE(review): not reached if anything above raises, which would leave link_queue.join() waiting forever
+        link_queue.task_done()
+
+# Seed the queue with the starting page (file name only; the reader was given base_path)
+initial_link = "index.html"
+link_queue.put(initial_link)
+
+# Create and start the worker threads that process the links
+num_threads = 4  # Number of worker threads; tune for the host system
+threads = []
+for _ in range(num_threads):
+    thread = threading.Thread(target=procesar_enlace)
+    thread.start()
+    threads.append(thread)
+
+# Block until every queued link has been acknowledged with task_done()
+link_queue.join()
+
+# Stop the workers now that the queue has drained
+for _ in threads:
+    link_queue.put(None)  # One `None` sentinel per worker so each loop exits
+
+for thread in threads:
+    thread.join()
+
+print("Todos los enlaces han sido procesados.")
--- a/Ejercicio04_WebScrapper/PruebaMain.py
+++ b/Ejercicio04_WebScrapper/PruebaMain.py
@ -0,0 +1,69 @@
+import os
+import threading
+import queue
+import time
+from WebFileReader import WebFileReader
+
+print("Directorio actual:", os.getcwd())
+
+# Queues used for communication between the threads
+data_queue = queue.Queue()
+link_queue = queue.Queue()
+
+# WebFileReader instance; NOTE(review): Main.py now passes base_path=... - confirm the constructor still accepts no arguments
+reader = WebFileReader()
+
+# Thread A: reads a fixed list of HTML files and puts (filename, content) tuples on data_queue
+def hilo_a():
+    filenames = ["index.html", "1.html", "2.html"]
+    for filename in filenames:
+        print(f"[Hilo A] Leyendo archivo: {filename}")
+        content = reader.read_file(filename)
+        if content:
+            data_queue.put((filename, content))
+        time.sleep(1) # Simulated wait between reads
+
+# Thread B: takes HTML content from data_queue, extracts its links and puts each one on link_queue
+def hilo_b():
+    while True:
+        if not data_queue.empty():  # NOTE(review): polls without sleeping, so the loop spins while the queue is empty
+            filename, html_content = data_queue.get()  # NOTE(review): hilo_c also consumes data_queue, so each item reaches only one of the two threads
+            print(f"[Hilo B] Extrayendo enlaces de: {filename}")
+            links = reader.extract_links(html_content)
+            for link in links:
+                link_queue.put(link)
+
+# Thread C: takes HTML content from data_queue and writes it to a <name>_content.txt file in the output folder
+def hilo_c():
+    while True:
+        if not data_queue.empty():  # NOTE(review): polls without sleeping, so the loop spins while the queue is empty
+            filename, html_content = data_queue.get()  # NOTE(review): hilo_b also consumes data_queue, so each item reaches only one of the two threads
+            save_path = f"EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output/{filename.replace('.html', '')}_content.txt"
+            with open(save_path, 'w', encoding='utf-8') as file:  # 'w' overwrites any existing file for this page
+                file.write(html_content)
+                print(f"[Hilo C] Guardando contenido de {filename} en {save_path}")
+
+# Thread D: takes links from link_queue and appends each one to extracted_links.txt
+def hilo_d():
+    while True:
+        if not link_queue.empty():  # NOTE(review): polls without sleeping, so the loop spins while the queue is empty
+            link = link_queue.get()
+            with open("EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output/extracted_links.txt", 'a', encoding='utf-8') as file:  # reopened in append mode for every link
+                file.write(link + "\n")
+            print(f"[Hilo D] Enlace guardado: {link}")
+
+thread_a = threading.Thread(target=hilo_a, daemon=True)  # daemon=True: these threads do not keep the process alive on their own
+thread_b = threading.Thread(target=hilo_b, daemon=True)
+thread_c = threading.Thread(target=hilo_c, daemon=True)
+thread_d = threading.Thread(target=hilo_d, daemon=True)
+
+thread_a.start()
+thread_b.start()
+thread_c.start()
+thread_d.start()
+
+try:  # Keep the main thread alive until Ctrl+C, because all workers are daemon threads
+    while True:
+        time.sleep(1)
+except KeyboardInterrupt:
+    print("Programa terminado")
--- a/Ejercicio04_WebScrapper/WebFileReader.py
+++ b/Ejercicio04_WebScrapper/WebFileReader.py
@ -27,5 +27,3 @@ class WebFileReader:
        soup = BeautifulSoup(html_content, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        return links
-
-
--- a/Ejercicio04_WebScrapper/pycache/WebFileReader.cpython-312.pyc
+++ b/Ejercicio04_WebScrapper/pycache/WebFileReader.cpython-312.pyc
--- a/Ejercicio04_WebScrapper/resources/input_html/1.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/1.html
@ -3,9 +3,9 @@
        <meta charset="UTF-8">
        <title>Página 1</title>
    </head>
-    <BODY>
+    <body>
        <a href="2.html">Ir a página 2</a>
        <br>
        <a href="index.html">Volver</a>
-    </BODY>
+    </body>
 </html>
--- a/Ejercicio04_WebScrapper/resources/input_html/2.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/2.html
@ -3,9 +3,11 @@
        <meta charset="UTF-8">
        <title>Página 2</title>
    </head>
-    <BODY>
+    <body>
        <a href="1.html">Ir a página 1</a>
        <br>
        <a href="index.html">Volver</a>
-    </BODY>
+
+        <a href="prueba"></a>
+    </body>
 </html>
--- a/Ejercicio04_WebScrapper/resources/input_html/index.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/index.html
@ -3,7 +3,7 @@
        <meta charset="UTF-8">
        <title>Inicio</title>
    </head>
-    <BODY>
+    <body>
        <a href="1.html">Ir a página 1</a>
-    </BODY>
+    </body>
 </html>
--- a/Ejercicio04_WebScrapper/resources/output/1_content.txt
+++ b/Ejercicio04_WebScrapper/resources/output/1_content.txt
@ -1,11 +0,0 @@
-<html lang="es">
-    <head>
-        <meta charset="UTF-8">
-        <title>Página 1</title>
-    </head>
-    <BODY>
-        <a href="2.html">Ir a página 2</a>
-        <br>
-        <a href="index.html">Volver</a>
-    </BODY>
-</html>
--- a/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
+++ b/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
@ -1,7 +1,16 @@
+1.html
 2.html
 index.html
 1.html
 index.html
 1.html
+2.html
+index.html
 1.html
 index.html
+1.html
+2.html
+index.html
+1.html
+index.html
+prueba
--- a/Ejercicio04_WebScrapper/resources/output/index_content.txt
+++ b/Ejercicio04_WebScrapper/resources/output/index_content.txt
@ -1,9 +0,0 @@
-<html lang="es">
-    <head>
-        <meta charset="UTF-8">
-        <title>Inicio</title>
-    </head>
-    <BODY>
-        <a href="1.html">Ir a página 1</a>
-    </BODY>
-</html>