
    (#hD                        d Z 	 ddlZdZddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC  ej                  eE      ZF ej                  d       ej                  ej                  z   ZJ ej                  d      ZL ej                  d      ZM ej                  d      ZNg dZOddddd ZPd!ed"eeQ   fd#ZRd$eQd"eeQ   fd%ZSd!ed"e"fd&ZTd'eQd"eUfd(ZVd)eQd*eWd"eQfd+ZXd,eQd-eQd"e eQeQf   fd.ZYd/eQd"eQfd0ZZ	 	 dMd!ed1eQd/eQd2eWd3eeQ   d"e eQeQf   fd4Z[dNd5eQd!ed2eWd"eQfd6Z\	 	 	 dOd7eeQ   d!ed1eQd2eWd3eeQ   d"dfd8Z]d9eQd"eeQddf   fd:Z^	 dMd$eQd!ed2eWd;ee8   d"df
d<Z_d5eQd!ed2eWd;ee8   d"eWf
d=Z`d>e"d!ed2eWd;e8d"e eeQ   eWf   f
d?Zad!ed"eWfd@Zbd>e"dAeeQ   d!ed"e"fdBZc	 	 	 dPd!edCeWd>ee"   d;ee8   d"df
dDZdd!ed"dfdEZedFeeQ   dGeWd"eWfdHZfd!ed>e"d"eWfdIZgd!ed"dfdJZh	 	 dQd5ee   d!edKeeQ   d;ee8   d"eeQ   f
dLZiy# e$ r dZY w xY w)Rz1
Functions dedicated to command-line processing.
    NTF)urlsafe_b64encode)ProcessPoolExecutorThreadPoolExecutoras_completed)datetime)partial)makedirspathstatwalk)RLock)Any	GeneratorOptionalListSetTuple)UrlStoreextract_domainget_base_url)spider   )html2txt)extract)generate_bow_hash)Responseadd_to_compressed_dictbuffered_downloadsbuffered_response_downloadsload_download_buffer)find_feed_urls)reset_caches)	ExtractorFILENAME_LENMAX_FILES_PER_DIRECTORYargs_to_extractor)sitemap_search)LANGID_FLAGURL_BLACKLIST_REGEXis_acceptable_lengthlanguage_classifiermake_chunksiY  z[^/]+$z\.[a-z]{2,5}$z<[^<]+?>)URLcrawlexploreprobefeedsitemapz.csvz.jsonz.xml)csvjsonxmlxmlteiargsreturnc                    g }| j                   r;	 t        | j                   dd      5 }|j                  d |D               ddd       n&t        D ]  }t        | |      st        | |      g} n |st        j                  d       t        t        j                  |            S # 1 sw Y   exY w# t        $ r t	        j
                  d       Y aw xY w)zGRead list of URLs to process or derive one from command-line arguments.rutf-8modeencodingc              3   <   K   | ]  }|j                           y wN)strip).0lines     R/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/cli_utils.py	<genexpr>z"load_input_urls.<locals>.<genexpr>V   s     !E4$**,!E   Nz+ERROR: system, file type or buffer encodingzNo input provided)
input_fileopenextendUnicodeDecodeErrorsysexitINPUT_URLS_ARGSgetattrLOGGERwarninglistdictfromkeys)r7   
input_urls	inputfileargs       rD   load_input_urlsrW   N   s    J	DdooC'B Fi!!!E9!EEF
 # 	CtS!%dC01
	
 *+ j)**F F! 	DHHBC	Ds(   B1 B%B1 %B.*B1 1CCfilenamec           	          t        | dd      5 }|D ch c]&  }t        j                  d|j                               ( }}ddd       |S c c}w # 1 sw Y   S xY w)zRead list of unwanted URLs.r:   r;   )r>    N)rH   r)   subrA   )rX   inputfhrC   	blacklists       rD   load_blacklistr^   f   s`    	hg	. T'KRS4(,,R>S	ST  TT s   A+AAAAc                     t        |       }t        || j                  | j                  xr | j                   | j
                  | j                        S )zGRead input list of URLs to process and build a domain-aware dictionary.)r]   compression
url_filterverbose)rW   r   r]   r2   rQ   ra   rb   )r7   	inputlists     rD   load_input_dictrd   n   sC    %I!..\\3$))m??     	directoryc                     t        j                  |       rt        j                  |       s	 t        | d       yy# t        $ r( t
        j                  j                  d| z   dz          Y yw xY w)z;Check if the output directory is within reach and writable.T)exist_okz0ERROR: Destination directory cannot be created: 
F)r
   existsisdirr	   OSErrorrK   stderrwrite)rf   s    rD   check_outputdir_statusro   {   sg     ;;y!I)>
	Y.   	 JJBYNQUU 	s   ; .A,+A,dirnamecc                 x    |dk\  rt        t        |t        z        dz         nd}t        j                  | |      S )z7Return a destination directory based on a file counter.r   r   rZ   )strintr%   r
   join)rp   rq   c_dirs      rD   determine_counter_dirrw      s5    9:aCA//0145RE99We$$re   destdir	extensionc                     d}|t        j                  |      rWdj                  d t        t              D              }t        j                  | ||z         }|At        j                  |      rW|fS )zCFind a writable path and return it along with its random file name.NrZ   c              3   N   K   | ]  }t        j                  t                y wr@   )randomchoice
CHAR_CLASS)rB   _s     rD   rE   z$get_writable_path.<locals>.<genexpr>   s     R6==4Rs   #%)r
   rj   ru   ranger$   )rx   ry   output_pathrX   s       rD   get_writable_pathr      sf    K

[!977ReL>QRRiiI)=> 
[!9   re   contentc                 p    t        t        t        j                  d|       d            j	                         S )zaCreate a filename-safe string by hashing the given content
    after deleting potential XML tags.rZ      )r   r   	CLEAN_XMLr[   decode)r   s    rD   generate_hash_filenamer      s+     .y}}R/I2NOVVXXre   orig_filenamecounternew_filenamec                 x   t         j                  | j                  d      }| j                  rMt        j                  d|      }t        j                  | j                  |      }t        j                  d|      }n%t        | j                  |      }|xs t        |      }t        j                  |||z         }	|	|fS )zPPick a directory based on selected options and a file name based on output type.z.txtrZ   )EXTENSION_MAPPINGgetoutput_format	keep_dirs	STRIP_DIRr[   r
   ru   
output_dirSTRIP_EXTENSIONrw   r   )
r7   r   r   r   r   ry   original_dirdestination_dirrX   r   s
             rD   determine_output_pathr      s     "%%d&8&8&AI~~ }}R7))DOO\B"&&r=9/IB#9'#B))OX	-ABK''re   
htmlstringc                    t        |j                  |      }t        |d      \  }}t        |      du rGt        rAt        j                  |d      5 }|j                  | j                  d             ddd       |S |S # 1 sw Y   |S xY w)z-Write a copy of raw HTML in backup directory.z.html.gzTwbr;   N)	rw   
backup_dirr   ro   HAS_GZIPgziprH   rn   encode)r   r7   r   destination_directoryr   rX   
outputfiles          rD   archive_htmlr      s|    1$//7K-.CZPK34<YY{D) 	9ZZ..w78	9O8O	9Os   !A<<Bresultc                    | y|j                   #t        j                  j                  | dz          yt	        ||| ||      \  }}t        |      du r)t        |dd      5 }|j                  |        ddd       yy# 1 sw Y   yxY w)z-Deal with result (write to STDOUT or to file)Nri   Twr;   r<   )r   rK   stdoutrn   r   ro   rH   )r   r7   r   r   r   destination_pathr   r   s           rD   write_resultr      s     ~

$',A-,-
)/ "/2d:&S7C )z  () ) ;) )s    A<<Binputdirc              #   r   K   t        |       D ]%  \  }}}|D ]  }t        j                  ||        ' yw)z2Walk the directory tree and output all file names.N)r   r
   ru   )r   rootr   
inputfilesfnames        rD   generate_filelistr      s@     #H~ )a 	)E))D%((	))s   57optionsc                    |st        |      }| |_        t        | d      5 }|j                         }ddd       t	        |       }t        |j                  |j                        }t        j                  |      j                  d      |j                  d<   t        ||      }t        ||| |d       y# 1 sw Y   xY w)z1Aggregated functions to process a file in a list.rbNz%Y-%m-%dmax_dater   )r   )r&   sourcerH   readr   minst_ctimest_mtimer   fromtimestampstrftimedate_paramsexaminer   )	rX   r7   r   r   inputfr   	file_statref_timestampr   s	            rD   file_processingr      s     #D)GN	h	 #[[]
# XI	**I,>,>?M&.&<&<]&K&T&T'G
# Zw7FxtD# #s   B55B>c                     |j                   rt        | ||      nd}t        | ||      }t        |||||       |dk\  r|r|dz  }|S )zVExtract text and metadata from a download webpage and eventually write out the result.rZ   r   )r   r   r   r   r   )r   r   r   r   )r   r7   r   r   fileslugr   s         rD   process_resultr      sT    
 ;?//|Jg6rHZw7FHgH !|1Nre   	url_storec                 t   g }|j                   j                  dd      }| j                  st        | |      \  }} t	        ||j
                  |      D ]T  \  }}|r&t        |t              r||_        t        ||||      }.t        j                  d|       |j                  |       V | j                  s||fS )z?Implement a download queue consumer, single- or multi-threaded.DEFAULT
SLEEP_TIMEr   zNo result for URL: %s)configgetfloatdoner    r   parallel
isinstancers   urlr   rO   rP   append)	r   r7   r   r   errors
sleep_time
bufferlistr   r   s	            rD   download_queue_processingr     s     F((LAJnn 4Y
 K
I-w
 		#KC *VS1!(wH6<c"		# nn 7?re   c           	        	
 t        |       }|j                         }| j                  r|j                          t	        |       }t        | j                  rt        nt        | j                  |j                  j                  dd      |j                  j                  dd            
t               }t        | j                        5 		
fd|D        }t!        |      D ]  }|j#                         |j%                  |j#                                | j                  s@t'        |j)                               | j                  k\  sg|5  |j+                          |j                          t-                ddd        	 ddd       t/        | |      }| j0                  rt3        |||       }t5        | ||       |S # 1 sw Y   xY w# 1 sw Y   JxY w)	z/Group CLI functions dedicated to URL discovery.r   EXTERNAL_URLSr   )target_langexternalr   max_workersc              3   B   K   | ]  }j                  |        y wr@   )submit)rB   r   executorfuncs     rD   rE   z cli_discovery.<locals>.<genexpr>5  s     D#8??4-Ds   N)r   r   )rd   	dump_urlsrQ   resetr&   r   r1   r!   r'   target_languager   
getbooleanr   r   r   r   r   r   add_urlslenget_known_domainsprint_unvisited_urlsr"   url_processing_pipeliner/   build_exploration_dictcli_crawler)r7   r   rT   r   lockfuturesfuture	exit_codecontrol_dictr   r   s            @@rD   cli_discoveryr   #  sy   %I$$&Jyy%G))((**9oF>>**9lC	D 7D 
	6 '(DD #7+ 	'F}}*""6==?399Y%@%@%B!Ct}}!T '!668!)$' '	'	' (i8I ||-iTJDL'B' '' 's6   +G4+G &GG	+F<4
G<GGGrT   c                 8   |D ch c]  }t        |       }}|| j                         D ch c]  }t        |       c}z
  }|D cg c]  }t        |      |v s| }}t        ||j                  |j                  |j
                        S c c}w c c}w c c}w )zMFind domains for which nothing has been found and add info to the crawl dict.)r]   ra   rb   )r   r   r   r]   ra   rb   )r   rT   r7   uinput_domainsstill_to_crawlnew_input_urlss          rD   r   r   N  s     1;;1^A&;M;"#,#>#>#@&q& N ",SA~a/@N/RaSNS!..??	  <& Ts   BBBBnc                 0   |xs t        |       }|j                  j                  dd      }i }|)t        j                  j                  t        |              n|t        _        t        j                  j                         D ]r  }t        j                  j                  |   j                  s+t        j                  j                  |d      }|sOt        j                  || j                        ||<   t t        j                  j                  st        t        j                  |      \  }t        _        t        || j                   |      D ];  \  }	}
|
s	t#        |
t$              st        j&                  |
|t)        |	                = t+        fdt        j                  j-                         D              rnt        j                  j                  st/        d	j1                  d
 t        j                  j3                         D                     y)z~Start a focused crawler which downloads a fixed number of URLs within a website
    and prints the links found in the process.r   r   NF)
as_visited)langr   c              3   (   K   | ]	  }|k\    y wr@    )rB   rq   r   s     rD   rE   zcli_crawler.<locals>.<genexpr>  s     A!qAvAs   ri   c              3       K   | ]  }|  y wr@   r   )rB   r   s     rD   rE   zcli_crawler.<locals>.<genexpr>  s     <!A<s   )r&   r   r   r   	URL_STOREr   rW   r   urldicttuplesget_url
init_crawlr   r   r    r   r   r   r   process_responser   anyget_all_countsprintru   r   )r7   r   r   r   r   
param_dicthostname	startpager   r   r   s    `         rD   r   r   _  s    0*40G((LAJJ !!/$"78$ $$668 ##H-44((00e0LI'-'8'8D$8$8(
8$	 ##';j(
$
F$ 7w
 	OKC *VX6''
<;L0MN		O Av//>>@AA ## 
$))<v//99;<
<=re   c                 \   t        |       }t        |       }t        || j                  |      D ]{  \  }}|	t	        |      }|st        |      |j                  kD  s0t        d |D              sCt        r&| j                  rt        |d      | j                  k(  sot        |d       } y)zBProbe websites for extractable content and print the fitting ones.r   Nc              3   <   K   | ]  }|j                           y wr@   )isalpha)rB   rq   s     rD   rE   z!probe_homepage.<locals>.<genexpr>  s     4		4rF   rZ   T)flush)rW   r&   r   r   r   r   min_extracted_sizer   r(   r   r+   r   )r7   rT   r   r   r   s        rD   probe_homepager	    s     &J%G)DMM7 +V f%FK'"<"<<4V44 $//*626$:N:NN#T*+re   r   totalc                 @    |dkD  rt        |       |z  nd}|dkD  ry| ryy)zvCompute exit code based on the number of errors:
    0 if there are no errors, 126 if there are too many, 1 otherwise.r   gGz?~   r   )r   )r   r
  ratios      rD   _define_exit_coder    s-     $)19CK%!Et|re   c                 f   | j                   r|j                          yt        |       }|j                         }|t        kD  rdnd}t        || ||      \  }}t        j                  dt        |      |       | j                  du rt               }|j                  |D cg c]  }d|z   	 c}       t        |j                  d            dkD  rTt        || ||      \  }}t        j                  dt        |      t        |             t        ||j                               S t        ||      S c c}w )	zKAggregated functions to show a list and download and process an input list.Fr   z%s / %s URLs could not be foundTzhttps://web.archive.org/web/20/zhttps://web.archive.orgz-%s archived URLs out of %s could not be found)rQ   r   r&   total_url_numberr%   r   rO   debugr   archivedr   r   find_known_urlsr  )	r7   r   r   	url_countr   r   earchived_errorsr   s	            rD   r   r     s   yy&&(%G**,I66aBG 0	4'ROFG
LL2CKK}}J	6Ra=ARSy(()BCDqH!:4'"OQ LL?O$F %_i6P6P6RSSVY// Ss   D.c                    d}t        |       }|j                  j                  dd      }t        | j                        5 }t        t        | j                        t              D ]W  }|dk  rt        |      t        k\  rd}t        t        | ||      }|j                  ||d|       |dk\  sJ|t        |      z  }Y 	 d	d	d	       y	# 1 sw Y   y	xY w)
zGDefine batches for parallel file processing and perform the extraction.r  r   EXTRACTION_TIMEOUTr   r   )r7   r   r   
   )	chunksizetimeoutN)r&   r   getintr   r   r,   r   	input_dirr%   r   r   r   map)r7   filecounterr   r  r   	filebatchworkers          rD   file_processing_pipeliner#    s    K%Gnn##I/CDG 
	7 .8$dnn-/F
 	.I Q3y>5L#LdKF LLb'LJas9~-	.. . .s    A-C.CCr   c                    d}|st        ||      }| !t        j                  j                  d       |S t	        t        |       |      s!t        j                  j                  d       |S 	 t        | |      }|S # t        $ rL}t        j                  j                  dt        |       dt        j                          d       Y d}~|S d}~ww xY w)z;Generic safeguards and triggers around extraction function.NzERROR: empty document
zERROR: file size
r   zERROR: ri   )r&   rK   rm   rn   r*   r   r   	Exceptionrs   	traceback
format_exc)r   r7   r   r   r   errs         rD   r   r     s     F#D#.

23 M "#j/7;

-. M	OZ9F M  	OJJws3xj93G3G3I2J"MNNM	Os   +A: :	CAC

C)r  N)r  )rZ   r  N)   NN)NN)j__doc__r   r   ImportErrorloggingr|   restringrK   r&  base64r   concurrent.futuresr   r   r   r   	functoolsr   osr	   r
   r   r   	threadingr   typingr   r   r   r   r   r   courlanr   r   r   trafilaturar   baseliner   corer   deduplicationr   	downloadsr   r   r   r   r    feedsr!   metar"   settingsr#   r$   r%   r&   sitemapsr'   utilsr(   r)   r*   r+   r,   	getLogger__name__rO   seedascii_lettersdigitsr~   compiler   r   r   rM   r   rs   rW   r^   rd   boolro   rt   rw   r   r   r   r   r   r   r   r   r   r   r   r   r	  r  r   r#  r   r   re   rD   <module>rG     s  H   	  
  $ T T   ) )  = = : :    ,  "   %  
		8	$ C !!FMM1
BJJy!	"**-.BJJ{#	I 	 +# +$s) +0S SX 
# 
( 
c d $%3 %3 %3 %!s !s !uS#X !YC YC Y "&(
(( ( 	(
 3-( 38_(4	S 	 	c 	3 	 "&)SM)
) ) 	)
 3-) 
),) )	#tT/(B ) QUEEE'*E:B9:ME	E*),7?	7J""-0;D
49c>.( ( (V%)#Y69& $(#'	->
->
-> !-> i 	->
 
->`+ + +.	d3i 	 	 	0# 0( 0s 0@.3 .4 .4 #'	
 
# i 	
 c]C  Hs   I? ?J
	J
