
    (#h0                        d Z ddlmZ ddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZ 	 ddlmZ  e ed            Zdd	lmZ dd
lmZmZmZ ddlmZ g dZ ee      dhz  Z	 d`de
e   de
e   defdZ e       ZdddddddddZ  G d d      Z!dadede
e   de!fd Z"dbd!e#deeef   fd"Z$ G d# d$      Z% e&ed%      Z'd&Z(d'Z)d(Z*d)Z+d*Z,h d+Z-g d,Z.g d-Z/ ed.      Z0 e1g d/      Z2i d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_Z3y# e$ r ddlmZ  e       xs dZY 
w xY w)cz<
Listing a series of settings that are applied module-wide.
    )ConfigParser)datetime)unescape)AnyDictListOptionalSet)sched_getaffinity)	cpu_count   )Path)_ElementElementXPath)line_processing)csvjsonhtmlmarkdowntxtxmlxmlteipythonNfilenameconfigreturnc                     ||S | &t        t        t              j                  dz        } n$t        |       j	                         st        d      t               }|j                  |        |S )zE
    Use configuration object or read and parse a settings file.
    zsettings.cfgz$The given config file does not exist)strr   __file__parentis_fileFileNotFoundErrorr   read)r   r   s     Q/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/settings.py
use_configr&      sb     tH~,,~=>(^##% FGG^F
KKM    MIN_EXTRACTED_SIZEMIN_OUTPUT_SIZEMIN_OUTPUT_COMM_SIZEMIN_EXTRACTED_COMM_SIZEMIN_DUPLCHECK_SIZEMAX_REPETITIONSMAX_FILE_SIZEMIN_FILE_SIZE)min_extracted_sizemin_output_sizemin_output_comm_sizemin_extracted_comm_sizemin_duplcheck_sizemax_repetitionsmax_file_sizemin_file_sizec            ,          e Zd ZdZg dZeddddddddddddddddddddded	ed
ededededededededede	e   de	e   de	e   dededede	e
e      de	e
e      de	eeef      f(dZde	e   de	e   ddfdZdeddfd Zdeddfd!Zy)"	Extractorz0Defines a class to store all extraction options.)r   formatfastfocuscomments
formattinglinksimagestablesdeduplangr0   r1   r2   r3   r4   r5   r6   r7   max_tree_sizesourceurlwith_metadataonly_with_metadatatei_validationdate_paramsauthor_blacklisturl_blacklistr   FTN)r   output_formatr;   	precisionrecallr=   r>   r?   r@   rA   rB   rC   rF   rE   rG   rH   rI   rK   rL   rJ   r   rM   r;   rN   rO   r=   r>   r?   r@   rA   rB   rC   rF   rE   rG   rH   rI   rK   rL   rJ   c                :   | j                  ||       | j                  |       | j                  |       || _        |rdn|rdnd| _        || _        |xs | j                  dk(  | _        || _        |	| _	        |
| _
        || _        || _        || _        || _        || _        |xs
 t!               | _        |xs
 t!               | _        |xs |xs t'        |      xs |dk(  | _        |xs% t+        | j,                  j/                  dd            | _        d | _        y )NrO   rN   balancedr   r   DEFAULTEXTENSIVE_DATE_SEARCH)_set_source_set_format_add_configr;   r<   r=   r:   r>   r?   r@   rA   rB   rC   rF   rH   rI   setrK   rL   boolrG   set_date_paramsr   
getbooleanrJ   rD   )selfr   rM   r;   rN   rO   r=   r>   r?   r@   rA   rB   rC   rF   rE   rG   rH   rI   rK   rL   rJ   s                        r%   __init__zExtractor.__init__e   s#   0 	f%' 	H9K* 	
 ' * GdkkZ.G 
"" 
#'	"%(:$2*:*Cce'4'= )!)M") (	 	 ,7 ,
/KK""9.EF;
 "r'   r   c                 d    |xs |}|xr! |j                  dd      j                  d      | _        y)z)Set the source attribute in a robust way.zutf-8replaceN)encodedecoderE   )r[   rF   rE   s      r%   rT   zExtractor._set_source   s-    Rw	!B!I!I'!Rr'   chosen_formatc                 v    |t         vr*t        ddj                  t        t                            || _        y)z;Store the format if supported and raise an error otherwise.z#Cannot set format, must be one of: z, N)SUPPORTED_FORMATSAttributeErrorjoinsortedr:   )r[   ra   s     r%   rU   zExtractor._set_format   s;     11 5diiGX@Y6Z5[\  $r'   c           	          t         j                         D ]"  \  }}t        | ||j                  d|             $ || _        y)z&Store options loaded from config file.rR   N)CONFIG_MAPPINGitemssetattrgetintr   )r[   r   keyvalues       r%   rV   zExtractor._add_config   s>    (..0 	@JCD#v}}Y>?	@r'   )__name__
__module____qualname____doc__	__slots__DEFAULT_CONFIGr   r   rX   r	   r
   r   r\   rT   rU   rV    r'   r%   r9   r9   ?   s   6"IN  ." "! $##($/3,004-4" 4" 	4"
 4" 4" 4" 4" 4" 4" 4" 4" 4" sm4" c]4"  !4"" #4"$ !%4"& '4"( #3s8,)4"*  C)+4", d38n--4"lSx} Shsm S S
$ $ $, 4 r'   r9   argsrF   c                 v   t        t        | j                        | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  || j                  | j                  | j                        }dD ]  }t        ||t        | |              |S )z-Derive extractor configuration from CLI args.)r   )r   rM   r>   rN   rO   r=   rA   rB   rC   rF   rG   rH   rI   )r;   r@   r?   )r9   r&   config_filerM   r>   rN   rO   no_comments	no_tablesdeduplicatetarget_languagerG   rH   validate_teirj   getattr)ru   rF   optionsattrs       r%   args_to_extractorr      s    4#3#34((??..{{!!~~!!((22((G , 4wtT234Nr'   	extensivec                 P    d| t        j                         j                  d      dS )z/Provide default parameters for date extraction.Tz%Y-%m-%d)original_dateextensive_searchmax_date)r   nowstrftime)r   s    r%   rY   rY      s(     %LLN++J7 r'   c            ,          e Zd ZdZg dZdddddddddddd ed      d ed      ddddddddee   dee   dee   d	ee   d
ee   dee   dee   deee      deee      dee   dee   dee   de	dee   de	dee   dee   dee   dee   dee   dee   f*dZ
edeeef   dd fd       Zd dZdeeee   f   fdZy)!DocumentzZDefines a class to store all necessary data and metadata fields for extracted information.titleauthorrF   hostnamedescriptionsitenamedate
categoriestagsfingerprintidlicensebodyr=   commentsbodyraw_texttextlanguageimagepagetypefiledateNr   )r   r   rF   r   r   r   r   r   r   r   idvallicense_valr   r=   r   r   r   r   r   r   r   r   r   rF   r   r   r   r   r   r   r   r   r   r=   r   r   r   r   r   r   r   c                *   || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        y Nr   )r[   r   r   rF   r   r   r   r   r   r   r   r   r   r   r=   r   r   r   r   r   r   r   s                         r%   r\   zDocument.__init__   s    2 %*
%+"%'/*5'/#'	/9)-	*5!&&1"	'/&2'/#'	'/$)
'/'/r'   datar   c                 ^     |        }|j                         D ]  \  }}t        |||        |S )z.Set a series of attributes using a dictionary.)ri   rj   )clsr   docrl   rm   s        r%   	from_dictzDocument.from_dict  s5     e**, 	%JCCe$	%
r'   c                     | j                   D ]V  }t        | |      }t        |t              s t	        |      dkD  r|dd dz   }t        t        |            }t        | ||       X y)z*Limit text length and trim the attributes.'  Ni'  u   …)rr   r}   
isinstancer   lenr   r   rj   )r[   slotrm   s      r%   clean_and_trimzDocument.clean_and_trim!  sa    NN 	+DD$'E%%u:%!%4L50E'8dE*	+r'   c           	      X    | j                   D ci c]  }|t        | |d       c}S c c}w )z%Convert the document to a dictionary.N)rr   r}   )r[   r   s     r%   as_dictzDocument.as_dict-  s(    <@NNKDgdD$//KKKs   ')r   N)rn   ro   rp   rq   rr   r   r	   r   r   r   r\   classmethodr   r   r   r   r   rt   r'   r%   r   r      s   `I8  $ $!"&%)"&"*.$(%)#%) "&!("&""&#"&"&/-0 }-0 	-0
 c]-0 3--0 c]-0 3--0 sm-0 T#Y'-0 tCy!-0 c]-0 }-0 c]-0 -0  3-!-0" #-0$ 3-%-0& sm'-0( 3-)-0* }+-0, 3---0. 3-/-0^ T#s(^ 
  
+Lc8C=01 Lr'   r      i   i     i@B r   >   bipqdddtemh1h2h3h4h5h6lidivpremainspanstrongarticlesection
blockquote)3asideembedfooterformheadiframemenuobjectscriptappletaudiocanvasfiguremappicturesvgvideoareablinkbuttondatalistdialogframeframesetfieldsetlinkinputinslabellegendmarqueemathmenuitemnavnoindexnoscriptoptgroupoptionoutputparamprogressrprtrtcselectrE   styletracktextareatimeuse)abbracronymaddressbdibdobigciter   dfnfonthgroupimgr   markmetarubysmalltbodytemplatetfoottheadzL.//aside|.//div[contains(@class|@id, 'footer')]|.//footer|.//script|.//style)
r   codedelr   hilblistr   r   quotearArabicbg	BulgarianczCzechdaDanishdeGermanenEnglishelGreekesSpanishfaPersianfiFinnishfrFrenchhrCroatianhu	HungariankoKoreanr   
IndonesianitItaliannoNorwegian_NynorskDutchPolish
PortugueseRomanianRussianSlovak	SlovenianSerbianSwedishTurkish	UkrainianUrdu
Vietnamese)nlplptroruskslsrsvtrukurvi)NNr   )T)4rq   configparserr   r   r   r   typingr   r   r   r	   r
   osr   r   	CPU_COUNTImportErrorr   pathlibr   
lxml.etreer   r   r   utilsr   SUPPORTED_FMT_CLIrW   rc   r   r&   rs   rh   r9   r   rX   rY   r   minPARALLEL_CORESLRU_SIZEMAX_FILES_PER_DIRECTORYFILENAME_LEN	MAX_LINKSMAX_SITEMAPS_SEENCUT_EMPTY_ELEMSMANUALLY_CLEANEDMANUALLY_STRIPPEDBASIC_CLEAN_XPATH	frozensetTAG_CATALOGJUSTEXT_LANGUAGESrt   r'   r%   <module>re     s   &   1 1!$%a()I
  / / " P )*hZ7  FJsm,4\,B&  /(28.($$	m m`C hsm y ,t tCH~ `L `LH Y#   	 :7 t 2 R  R!(!+! 	'! 	(	!
 	(! 	)! 	'! 	)! 	)! 	)! 	(! 	*! 	+! 	(!  	,!!" 	)#!$ 	
%!& 











?! Y  ! qI!s   D) )EE