From 55b994f38e03ff8fefdfcb7cc6f13f2fdeaeb631 Mon Sep 17 00:00:00 2001
From: Dennis Eckerskorn
Date: Sun, 8 Dec 2024 20:29:07 +0100
Subject: [PATCH] Trying to make the scrapper work on different threads; it works, but it runs too fast and the app sometimes crashes

---
 .../__pycache__/scrapper.cpython-313.pyc  | Bin 6617 -> 7828 bytes
 src/services/scrapper.py                  |  82 +++++++++++++-----
 .../centered_window.cpython-313.pyc       | Bin 13139 -> 13139 bytes
 3 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/src/services/__pycache__/scrapper.cpython-313.pyc b/src/services/__pycache__/scrapper.cpython-313.pyc
index 6a04ec193440d808d27293863b71426c420ad645..a37199b14e4816ce7678981f4abffd49547cd288 100644
Binary files a/src/services/__pycache__/scrapper.cpython-313.pyc and b/src/services/__pycache__/scrapper.cpython-313.pyc differ
diff --git a/src/services/scrapper.py b/src/services/scrapper.py
index a1594e0..3c477de 100644
--- a/src/services/scrapper.py
+++ b/src/services/scrapper.py
@@ -3,6 +3,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 import mysql.connector
+from queue import Queue
 
 #http://books.toscrape.com/ test scrap web
 
@@ -12,6 +13,7 @@ class Scrapper:
         self.visited_links = set()
         self.running=False
         self.lock = threading.Lock()
+        self.link_queue = Queue()
 
         #Configurar la base de datos para los enlaces
         self.db_config = {
@@ -34,8 +36,9 @@ class Scrapper:
         self.running = True
         url = self.get_url_from_ui()
         if url:
-            print(f"Iniciando scraping en: {url}")
-            self.scrape_page(url)
+            print(f"Iniciando scraping en: {url}")
+            threading.Thread(target=self.scrape_page, args=(url,), daemon=True).start()
+            threading.Thread(target=self.insert_links_to_db, daemon=True).start()
         else:
             print("No se proporcionó una URL válida.")
 
@@ -44,6 +47,10 @@ class Scrapper:
         self.running = False
         print("Scrapping detenido. Proceso finalizado.")
 
+        #Vaciar la cola para detener el hilo de inserción
+        while not self.link_queue.empty():
+            self.link_queue.get()
+
         # Actualiza la pestaña "Scrapping" con un mensaje
         tab = self.ui_instance.tabs["Scrapping"]
         text_widget = tab["text_widget"]
@@ -67,30 +74,66 @@ class Scrapper:
             soup = BeautifulSoup(response.text, "html.parser")
             links = [urljoin(url, a.get("href")) for a in soup.find_all("a", href=True)]
             self.update_ui(url, links)
-            self.save_links_to_db(url, links)
             for link in links:
-                if self.running:
-                    threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
+                if not self.running:
+                    break
+                self.link_queue.put((url, link))
+
+            for link in links:
+                if not self.running:
+                    break
+                threading.Thread(target=self.scrape_page, args=(link,), daemon=True).start()
         else:
             print(f"Error al acceder a {url}: {response.status_code}")
     except Exception as e:
         print(f"Error al scrapear {url}: {e}")
 
-    def update_ui(self, url, links):
-        """Actualiza la pestaña 'Scrapping' con los enlaces encontrados"""
-        tab = self.ui_instance.tabs["Scrapping"]
-        text_widget = tab["text_widget"]
+    def update_ui(self, url, links):
+        """Actualiza la pestaña 'Scrapping' con los enlaces encontrados"""
+        tab = self.ui_instance.tabs["Scrapping"]
+        text_widget = tab["text_widget"]
 
-        text_widget.configure(state="normal")
-        text_widget.insert("end", f"Enlaces encontrados en {url}:\n")
-        for link in links:
+        text_widget.configure(state="normal")
+        text_widget.insert("end", f"Enlaces encontrados en {url}:\n")
+        for link in links:
             text_widget.insert("end", f" - {link}\n")
-        text_widget.see("end")
+        text_widget.see("end")
         text_widget.configure(state="disabled")
+
+
+    def insert_links_to_db(self):
+        """Inserta los enlaces en la base de datos desde la cola"""
+        while self.running or not self.link_queue.empty():
+            try:
+                # Obtener un enlace de la cola
+                if not self.running and self.link_queue.empty():
+                    break
+                parent_url, link = self.link_queue.get(timeout=1)  # Espera 1 segundo si la cola está vacía
+                connection = mysql.connector.connect(**self.db_config)
+                cursor = connection.cursor()
+                cursor.execute("CREATE TABLE IF NOT EXISTS links (id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)")
+                cursor.execute("INSERT INTO links (url, parent_url) VALUES (%s, %s)", (link, parent_url))
+                connection.commit()
+                cursor.close()
+                connection.close()
+                print(f"Enlace guardado: {link} (parent: {parent_url})")
+            except Exception as e:
+                print(f"Error al guardar en la base de datos: {e}")
+
+
+    def get_url_from_ui(self):
+        """Obtiene la URL desde la interfaz de usuario"""
+        try:
+            url_entry = self.ui_instance.left_panel.url_entry
+            return url_entry.get()
+        except AttributeError:
+            print("No se pudo obtener la URL desde la interfaz")
+            return None
+"""
     def save_links_to_db(self, url, links):
-        """Guarda los enlaces en la base de datos"""
+        Guarda los enlaces en la base de datos
         try:
             connection = mysql.connector.connect(**self.db_config)
             cursor = connection.cursor()
@@ -105,13 +148,6 @@ class Scrapper:
             connection.close()
         except Exception as e:
             print(f"Error al gaurdar en la base de datos: {e}")
-
-    def get_url_from_ui(self):
-        """Obtiene la URL desde la interfaz de usuario"""
-        try:
-            url_entry = self.ui_instance.left_panel.url_entry
-            return url_entry.get()
-        except AttributeError:
-            print("No se pudo obtener la URL desde la interfaz")
-            return None
+"""
+    
\ No newline at end of file
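The insert_links_to_db() consumer added above opens and closes a new MySQL connection for every link and is stopped by having stop_scrapping() drain the queue. A minimal sketch of a common alternative for this producer/consumer shape, not the code in this patch, is shown below: one long-lived connection plus a sentinel object that tells the writer thread to exit. It reuses the link_queue and db_config names from the diff and assumes the mysql-connector-python driver; the db_writer function and _SENTINEL name are illustrative.

    import queue

    import mysql.connector

    _SENTINEL = object()  # put once on the queue to ask the writer thread to exit


    def db_writer(link_queue: queue.Queue, db_config: dict) -> None:
        """Consume (parent_url, link) pairs and write them with one shared connection."""
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor()
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS links ("
            "id INT AUTO_INCREMENT PRIMARY KEY, url TEXT, parent_url TEXT)"
        )
        try:
            while True:
                item = link_queue.get()
                if item is _SENTINEL:  # graceful shutdown requested
                    break
                parent_url, link = item
                cursor.execute(
                    "INSERT INTO links (url, parent_url) VALUES (%s, %s)",
                    (link, parent_url),
                )
                connection.commit()
                link_queue.task_done()
        finally:
            cursor.close()
            connection.close()

With this shape, stop_scrapping() would only have to call link_queue.put(_SENTINEL): links already queued are still written, and the writer closes its connection exactly once instead of per row.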
diff --git a/src/ui/__pycache__/centered_window.cpython-313.pyc b/src/ui/__pycache__/centered_window.cpython-313.pyc
index 3ba83e69a8f8cc761e234ac4cddee96fe31e7e41..9421f4177f85fc1abbe54f0d5906e5cb2e828679 100644
Binary files a/src/ui/__pycache__/centered_window.cpython-313.pyc and b/src/ui/__pycache__/centered_window.cpython-313.pyc differ
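On the crash mentioned in the subject line: scrape_page() still starts a new daemon thread for every link it finds, so the number of threads and open HTTP connections grows without bound on any page with many links. A fixed-size pool is the usual way to cap that. The sketch below is an illustration under assumed names (BoundedScraper, max_workers), not this patch's implementation; it keeps the lock-protected visited-set idea the Scrapper class already uses.

    import threading
    from concurrent.futures import ThreadPoolExecutor
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup


    class BoundedScraper:
        """Sketch: crawl with at most max_workers requests in flight at once."""

        def __init__(self, max_workers: int = 8):
            self.pool = ThreadPoolExecutor(max_workers=max_workers)
            self.visited = set()
            self.lock = threading.Lock()
            self.running = True

        def submit(self, url: str) -> None:
            # Check-and-add under the lock so two workers never fetch the same URL.
            with self.lock:
                if not self.running or url in self.visited:
                    return
                self.visited.add(url)
            self.pool.submit(self.scrape, url)

        def scrape(self, url: str) -> None:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code != 200:
                    print(f"Error al acceder a {url}: {response.status_code}")
                    return
                soup = BeautifulSoup(response.text, "html.parser")
                for a in soup.find_all("a", href=True):
                    self.submit(urljoin(url, a["href"]))
            except requests.RequestException as exc:
                print(f"Error al scrapear {url}: {exc}")

Because the executor queues work instead of spawning a thread per link, at most max_workers requests run concurrently; a short time.sleep() inside scrape() would additionally slow the crawl if "too fast" also means hammering the target site.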