Updated project: reads files and extracts links, but has problems with threads

2024-10-31 12:16:05 +01:00 · 2024-10-31 12:16:05 +01:00 · 220045b92b
parent 902f347771
commit 220045b92b
9 changed files with 158 additions and 1 deletions
--- a/Ejercicio04_WebScrapper/Main.py
+++ b/Ejercicio04_WebScrapper/Main.py
@ -0,0 +1,69 @@
 import os
 import threading
 import queue
 import time
 from WebFileReader import WebFileReader
 print("Directorio actual:", os.getcwd())
 # Output locations. These are relative paths, so they resolve against the
 # working directory printed above.
 OUTPUT_DIR = "EjerciciosConHilos/Ejercicio04_WebScrapper/resources/output"
 LINKS_FILE = f"{OUTPUT_DIR}/extracted_links.txt"
 # Queues for communication between threads. A queue.Queue hands each item to
 # exactly one consumer, so threads B and C each get their own queue: when they
 # both read data_queue, every file reached only one of them.
 data_queue = queue.Queue()  # A -> B: (filename, html) pairs to parse
 save_queue = queue.Queue()  # A -> C: the same pairs, to be written to disk
 link_queue = queue.Queue()  # B -> D: extracted href values
 # WebFileReader instance shared by threads A and B
 reader = WebFileReader()
 # Thread A: reads the HTML files and hands them to threads B and C
 def hilo_a():
    """Read each input file and publish it to both consumer queues.
    Files for which the reader returns nothing (missing, unreadable or
    empty) are skipped. The function returns once every file was visited.
    """
    filenames = ["index.html", "1.html", "2.html"]
    for filename in filenames:
        print(f"[Hilo A] Leyendo archivo: {filename}")
        content = reader.read_file(filename)
        if content:
            item = (filename, content)
            # Fan out: B extracts links, C stores the raw content.
            data_queue.put(item)
            save_queue.put(item)
        time.sleep(1)  # Simulated waiting time
 # Thread B: extracts the links from the HTML content and puts them in link_queue
 def hilo_b():
    """Consume data_queue forever, pushing every extracted link to link_queue."""
    while True:
        # get() blocks until an item arrives, so there is no busy polling and
        # no gap between checking for an item and taking it.
        filename, html_content = data_queue.get()
        print(f"[Hilo B] Extrayendo enlaces de: {filename}")
        for link in reader.extract_links(html_content):
            link_queue.put(link)
 # Thread C: saves the content of each file as a text file in the output folder
 def hilo_c():
    """Consume save_queue forever, writing each file's HTML to OUTPUT_DIR."""
    # Create the output folder up front so the first write cannot fail on a
    # fresh checkout.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    while True:
        filename, html_content = save_queue.get()
        save_path = f"{OUTPUT_DIR}/{filename.replace('.html', '')}_content.txt"
        with open(save_path, 'w', encoding='utf-8') as file:
            file.write(html_content)
            print(f"[Hilo C] Guardando contenido de {filename} en {save_path}")
 # Thread D: appends the extracted links to a text file for later analysis
 def hilo_d():
    """Consume link_queue forever, appending one link per line to LINKS_FILE."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    while True:
        link = link_queue.get()
        # Opened in append mode per link, so links from earlier runs are kept.
        with open(LINKS_FILE, 'a', encoding='utf-8') as file:
            file.write(link + "\n")
        print(f"[Hilo D] Enlace guardado: {link}")
 # Daemon threads: they do not keep the process alive once the main thread exits.
 thread_a = threading.Thread(target=hilo_a, daemon=True)
 thread_b = threading.Thread(target=hilo_b, daemon=True)
 thread_c = threading.Thread(target=hilo_c, daemon=True)
 thread_d = threading.Thread(target=hilo_d, daemon=True)
 thread_a.start()
 thread_b.start()
 thread_c.start()
 thread_d.start()
 # B, C and D never return, so the main thread idles until the user hits Ctrl+C.
 try:
    while True:
        time.sleep(1)
 except KeyboardInterrupt:
    print("Programa terminado")
--- a/Ejercicio04_WebScrapper/WebFileReader.py
+++ b/Ejercicio04_WebScrapper/WebFileReader.py
@ -1 +1,31 @@
-import mysql.connector
+import os
 from bs4 import BeautifulSoup
 class WebFileReader:
    """Read HTML files from a base directory and extract the links they contain."""
    def __init__(self, base_path: str = "EjerciciosConHilos/Ejercicio04_WebScrapper/resources/input_html"):
        """Store the directory against which file names are resolved.
        Args:
            base_path: Directory that holds the HTML input files.
        """
        self.base_path = base_path
    def read_file(self, filename: str):
        """Read an HTML file and return its content as text.
        Args:
            filename: Name of the file, relative to ``base_path``.
        Returns:
            The decoded (UTF-8) file content, or ``None`` when the file is
            missing or cannot be read. A message is printed in both failure
            cases.
        """
        filepath = os.path.join(self.base_path, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print(f"[WebFileReader] Archivo no encontrado: {filepath}")
            return None
        except (OSError, ValueError) as e:
            # OSError: permissions, path is a directory, I/O failure.
            # ValueError: invalid path or undecodable bytes (UnicodeDecodeError).
            print(f"[WebFileReader] Error al leer el archivo {filepath}: {str(e)}")
            return None
    def extract_links(self, html_content):
        """Extract every link from the given HTML content.
        Args:
            html_content: HTML markup as text. ``None`` or an empty string is
                accepted, so the result of a failed ``read_file`` can be
                passed in directly.
        Returns:
            A list with the ``href`` value of each ``<a>`` tag that has one,
            in document order. Empty when there is no content.
        """
        if not html_content:
            # Nothing to parse; avoids handing None to the parser.
            return []
        soup = BeautifulSoup(html_content, 'html.parser')
        return [anchor['href'] for anchor in soup.find_all('a', href=True)]
--- a/Ejercicio04_WebScrapper/pycache/WebFileReader.cpython-312.pyc
+++ b/Ejercicio04_WebScrapper/pycache/WebFileReader.cpython-312.pyc
--- a/Ejercicio04_WebScrapper/resources/input_html/1.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/1.html
@ -0,0 +1,11 @@
 <html lang="es">
    <head>
        <meta charset="UTF-8">
        <title>Página 1</title>
    </head>
    <BODY>
        <a href="2.html">Ir a página 2</a>
        <br>
        <a href="index.html">Volver</a>
    </BODY>
 </html>
--- a/Ejercicio04_WebScrapper/resources/input_html/2.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/2.html
@ -0,0 +1,11 @@
 <html lang="es">
    <head>
        <meta charset="UTF-8">
        <title>Página 2</title>
    </head>
    <BODY>
        <a href="1.html">Ir a página 1</a>
        <br>
        <a href="index.html">Volver</a>
    </BODY>
 </html>
--- a/Ejercicio04_WebScrapper/resources/input_html/index.html
+++ b/Ejercicio04_WebScrapper/resources/input_html/index.html
@ -0,0 +1,9 @@
 <html lang="es">
    <head>
        <meta charset="UTF-8">
        <title>Inicio</title>
    </head>
    <BODY>
        <a href="1.html">Ir a página 1</a>
    </BODY>
 </html>
--- a/Ejercicio04_WebScrapper/resources/output/1_content.txt
+++ b/Ejercicio04_WebScrapper/resources/output/1_content.txt
@ -0,0 +1,11 @@
 <html lang="es">
    <head>
        <meta charset="UTF-8">
        <title>Página 1</title>
    </head>
    <BODY>
        <a href="2.html">Ir a página 2</a>
        <br>
        <a href="index.html">Volver</a>
    </BODY>
 </html>
--- a/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
+++ b/Ejercicio04_WebScrapper/resources/output/extracted_links.txt
@ -0,0 +1,7 @@
 2.html
 index.html
 1.html
 index.html
 1.html
 1.html
 index.html
--- a/Ejercicio04_WebScrapper/resources/output/index_content.txt
+++ b/Ejercicio04_WebScrapper/resources/output/index_content.txt
@ -0,0 +1,9 @@
 <html lang="es">
    <head>
        <meta charset="UTF-8">
        <title>Inicio</title>
    </head>
    <BODY>
        <a href="1.html">Ir a página 1</a>
    </BODY>
 </html>