
    (#hs                        d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+  ejX                  e-      Z.ddhZ/ddhZ0h dZ1h dZ2ddhZ3ddhZ4de5dedee
e6e5f      ddfdZ7dededee   fdZ8dededee   fdZ9ded ed!eddfd"Z:d#edededdfd$Z;d%ed&eddfd'Z<d%ede=fd(Z>d)ed*eddfd+Z?dededee   fd,Z@dede=fd-ZAdedefd.ZBdededee   fd/ZCded0e	e5   dedee   fd1ZDded0e	e5   dedee   fd2ZEd3e=defd4ZFd5ed0e	e5   dedee   fd6ZGdee   dee   fd7ZHded0e	e5   dedee   fd8ZIefd9ed:eded0edef
d;ZJd9ed0e	e5   dedefd<ZKd9ededeee5e	e5   f   fd=ZLd>ededeee5eMf   fd?ZNd%ed0e	e5   dedee   fd@ZOd9ededeee5eMef   fdAZPy)Bz6
Functions related to the main Trafilatura extractor.
    N)deepcopy)AnyOptionalTupleSetUnion)_ElementElement
SubElementstrip_elements
strip_tagstostring)HtmlElement   )delete_by_link_densityhandle_textnodelink_density_test_tablesprocess_nodeprune_unwanted_nodes)TAG_CATALOG	Extractor)FORMATTING_PROTECTEDis_image_filetext_chars_testtrim)delete_element)
BODY_XPATHCOMMENTS_DISCARD_XPATHCOMMENTS_XPATHDISCARD_IMAGE_ELEMENTSOVERALL_DISCARD_XPATHPRECISION_DISCARD_XPATHTEASER_DISCARD_XPATHhireftdth>   r$   r&   r'   >   r$   r%   spancodequoteheadmsgtagtextreturnc           	      V    t         j                  d| |t        |xs d      xs d       y)z/Format extraction event for debugging purposes.z	%s: %s %s NoneN)LOGGERdebugr   )r,   r-   r.   s      W/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/main_extractor.py
_log_eventr6   &   s"    
LLc3TZR(8(BFC    elementoptionsc                    t        |       dk(  rt        | |      }nCt        |       }t        |       D ]*  }t	        ||d      }||j                  |       d|_        , |,t        dj                  |j                                     du r|S y)zProcess head elements (titles)r   Fcomments_fixNdoner1   T)
lenr   r   listr   appendr-   r   joinitertext)r8   r9   titlechildprocessed_childs        r5   handle_titlesrF   +   s    
7|q
 Wg. ! '] 	E .eW5QO*_-EI	 _RWWU^^5E-FG4Or7   c                     t        | |      }|y| j                         }|| j                         }||j                  t        vrt        d      }|j                  d|       |S |}|S )z[Process formatting elements (b, i, etc. converted to hi) found
       outside of paragraphsNpr   )r   	getparentgetpreviousr-   r   r
   insert)r8   r9   
formattingparentprocessed_elements        r5   handle_formattingrO   E   s~     gw/JD  F~$$&~+??#CL  J/  'r7   new_child_elemsubelemprocessed_subchildc                     t        | |j                        }|j                  |j                  c|_        |_        |j                  D ]!  }|j                  ||j                  |          # y)z/Add a sub-element to an existing child element.N)r   r-   r.   tailattribset)rP   rQ   rR   sub_child_elemattrs        r5   add_sub_elementrY   w   s`    0B0F0FGN/A/F/FHZH_H_,N, 74!567r7   rD   c                     | j                   |_         | j                  d      D ]U  }|j                  dk(  r t        ||      }|/|j	                  |       nt        ||d      }|t        |||       d|_        W y)z<Iterate through an element child and rewire its descendants.*r?   NFr;   r=   )r.   iterdescendantsr-   handle_listsr@   r   rY   )rD   rP   r9   rQ   rR   s        r5   process_nested_elementsr^      s    **N((- 	;;& !-gw!?!-%%&89!0'PU!V!-9KL	r7   elemnew_elemc                 P    | j                  d      x}r|j                  d|       yy)z>Copy the rend attribute from an existing element to a new one.rendN)getrV   )r_   r`   	rend_attrs      r5   update_elem_renditionre      s)    HHV$$y$VY' %r7   c                 b    | duxr* t        dj                  | j                                     du S )z"Find if the element contains text.Nr1   T)r   rA   rB   )r_   s    r5   is_text_elementrg      s+    tQ0H IT QQr7   processed_elem	orig_elemc                 |    | :t        || j                        }| j                  | j                  c|_        |_        yy)z&Create a new sub-element if necessary.N)r   r-   r.   rT   )rh   ri   	childelems      r5   define_newelemrl      s;    !y.*<*<=	)7)<)<n>Q>Q&		 "r7   c                 >   t        | j                        }| j                  7| j                  j                         rt	        |d      }| j                  |_        | j                  d      D ]  }t        d      }t        |      dk(  r~t        ||      }|"|j                  xs d|_        |j                  r<|j                  j                         r"|xj                  d|j                  z   z  c_        |j                  |       nt        |||       |j                  |j                  j                         r|D cg c]  }|j                  dk7  s| }}|r_|d   }|j                  |j                  j                         s|j                  |_        n"|xj                  d|j                  z   z  c_        |j                  st        |      dkD  rt        ||       |j                  |       d|_         d| _        t        |      rt        | |       |S yc c}w )z3Process lists elements including their descendants.Nitemr   r1    r=   )r
   r-   r.   stripr   r\   r>   r   rT   r@   r^   re   rg   )	r8   r9   rN   rP   rD   rE   elnew_child_elem_childrenlast_subchilds	            r5   r]   r]      s   ,||GLL$6$6$8#$5v>%ll ((0  u:?*5':O*&5&:&:&@b#"''O,@,@,F,F,H"''31E1E+EE'!((8#E>7Czz%%***:*:*<8F*[""&&TZJZ2*['*[*$;B$?M$))19K9K9Q9Q9S-2ZZ*%**cEJJ.>>*#n"5"9!%8$$^4	-. GK()g'89  ! +\s   <HHc                     | j                  d      s| j                  dk(  ry| j                         }|d|j                  dd      v ry| j                  d      }|t	        |       dk(  ryy)	zECheck if it is a code element according to common structural markers.langr)   T	highlightclassr1   r   F)rc   r-   rI   findr>   )r8   rM   r)   s      r5   is_code_block_elementrz      sl     {{6gkkV3 FkVZZ-DD<<DCLA-r7   c                 d    t        |       }| j                  d      D ]	  }d|_         d|_        |S )z/Turn element into a properly tagged code block.r[   r=   r)   )r   iterr-   )r8   rN   rD   s      r5   handle_code_blocksr}      s;     )c" 	"r7   c                     t        |       rt        |       S t        | j                        }| j	                  d      D ]#  }t        ||      }|t        ||       d|_        % t        |      rt        |d       |S y)zProcess quotes elements.r[   Nr=   r*   )	rz   r}   r
   r-   r|   r   rl   rg   r   )r8   r9   rN   rD   rE   s        r5   handle_quotesr      s    W%!'**,c" &ug6&?,=>		
 ()$g.  r7   potential_tagsc                    | j                   dk(  rd| j                  dd      v rt        |       S | j                   |vr1| j                   dk7  r!t        d| j                   | j                         y| j                   dk(  rZt        | |dd	
      }|It        |j                        d	u r2|j                  j                          |j                   dk(  rd|_         |S y)zAHandle diverse or unknown elements in the scope of relevant tags.divzw3-coderx   r1   r=   zdiscarding elementNFTr<   preserve_spacesrH   )	r-   rc   r}   r6   r.   r   r   rU   clear)r8   r   r9   rN   s       r5   handle_other_elementsr      s     {{e	W[["-E E!'** {{.(;;& +W[[',,G{{e ,GW5bfg(_=N=S=S-TX\-\$$**, $$-(+!%$$r7   c                    | j                   j                          t        |       dk(  rt        | |      S t	        | j
                        }| j                  d      D ]  }|j
                  |vr1|j
                  dk7  r"t        d|j
                  |j                         Ct        ||dd      }||j
                  d	k(  rct        d
d	|j                         |j                  r'|xj                  d|j                  xs dz   z  c_        n|j                  |_        d|_        t	        |j
                        }|j
                  t        v rt        |      dkD  rH|D ]C  }t        |j                        du rd|j                  z   |_        t        ||j
                         E |j
                  dk(  r#|j                  d|j                  dd             nB|j
                  dk(  r3|j                  d      "|j                  d|j                  dd             |j                  |j                  c|_        |_        |j
                  dk(  rt!        |      }||}|j#                  |       d|_         t        |      dkD  r-|d   }	|	j
                  dk(  r|	j                  t%        |	       |S |j                  r|S t        dd	t'        |             y)zIProcess paragraphs along with their children, trim and clean the content.r   r[   r=   zunexpected in pFTr   NrH   z
extra in pro   r1   r$   rb   r%   targetgraphicrp   lbzdiscarding element:)rU   r   r>   r   r
   r-   r|   r6   r.   r   P_FORMATTINGr   r   rV   rc   rT   handle_imager@   r   r   )
r8   r   r9   rN   rD   rE   newsubrn   
image_elem	last_elems
             r5   handle_paragraphsr     s|   NN 7|qGW--  ,c" 999N*uyyF/B(%))UZZ@ *%u^bc&""c)<o.B.BC$))%**c_5I5I5OR.PP*-<-A-A%*"	UYY'F""l2'!+ / >*4995=(+diiDI"?DHH=>
 99$JJvuyy'<=YY%'yy*6

8UYYx-DE& (7';';_=Q=Q$FK""i/)/:
)'F$$V,	s9v !%b)	==D Y^^%;9%    $c84E+FGr7   	is_headerc                 D    t        d      }| r|j                  dd       |S )z1Determine cell element type and mint new element.cellroler+   )r
   rV   )r   cell_elements     r5   define_cell_typer   b  s&     6?L(r7   
table_elemc           
         t        d      }t        | ddd       d}| j                  d      D ]1  }t        |t	        d |j                  t
              D                    }3 d}d}|d	kD  rt        |      nd
}t        d      }	|r|	j                  d|       | j                         D ]  }
|
j                  dk(  rGt        |	      dkD  r|j                  |	       t        d      }	|r|	j                  d|       |xs |}n|
j                  t
        v r~|
j                  dk(  xr | }|xs |}t        |      }t        |
      dk(  r3t        |
|      }||j                  |j                  c|_        |_        n|
j                  |
j                  c|_        |_        d|
_	        |
j                         D ]  }|j                  t         v r)|j                  t
        v rd|_	        t#        ||dd      }n]|j                  dk(  r1|j$                  dk(  r"t'        ||      }|1|j                  |       d}nt)        ||j+                  dg      |      }|t-        ||       d|_	         |j                  st        |      dkD  r#|	j                  |       n|
j                  dk(  r n
d|
_	         |	j.                  j1                  dd       t        |	      dkD  r|j                  |	       t        |      dkD  r|S y)zProcess single table element.tabletheadtbodytfootr   trc              3   R   K   | ]  }t        |j                  d d             ! yw)colspanr   N)intrc   ).0r&   s     r5   	<genexpr>zhandle_table.<locals>.<genexpr>u  s      $^2S	1)=%>$^s   %'Fr   r1   rowr(   r'   Nr=   r   T)r   r<   r?   recallr   )r
   r   r|   maxsumTABLE_ELEMSstrrV   r\   r-   r>   r@   r   r   r.   rT   	TABLE_ALLr   focusr]   handle_textelemunionrl   rU   pop)r   r   r9   newtablemax_colsr   seen_header_rowseen_header	span_attrnewrow
subelementr   rP   processed_cellrD   rR   s                   r5   handle_tabler   k  s   wH z7GW5 Hood# `x$^Q\I]$^!^_` OK!)AH2IU^F

69% 002 1 
>>T!6{Q' JJvy1"1"@[^^{*"$.F3FI%2K-i8N:!#!-j'!B!-?M?R?RTbTgTg<N')< <F??JOO8#^%8!'
'779 'EyyI- 993(.EI-<UG]apt-u*f,(1J-9%-I*-9*112DE15. .=UNDXDXZ_Y`Dacj-k*)5&'9>J &EI''* ""c.&9A&=n-^^w&
c1 h MMfd# 6{Q
8}qr7   c                    | yt        | j                        }dD ]3  }| j                  |d      }t        |      s!|j	                  d|        nT | j
                  j                         D ]6  \  }}|j                  d      st        |      s$|j	                  d|        n | j                  d      x}r|j	                  d|       | j                  d      x}r|j	                  d|       |j
                  r|j                  d      sy|j                  dd      }|j                  d      s'|j	                  dt        j                  d	d
|             |S )z5Process image elements and their relevant attributes.N)data-srcsrcr1   r   r   altrC   httpz^//zhttp://)
r
   r-   rc   r   rV   rU   items
startswithresub)r8   rN   rX   r   valuealt_attr
title_attrsrc_attrs           r5   r   r     sF   ,# 
kk$#!!%-	
 #>>//1 	KD%z*}U/C!%%eU3	 ;;u%%x%eX.[[))z)gz2 ##+<+@+@+G !$$UB/Hv&eRVVFIx%HIr7   c                 |   d}| j                   dk(  rt        | |      }|S | j                   t        v rt        | |      }|S | j                   dk(  rt	        | |      }|S | j                   dk(  rt        | ||      }|S | j                   dk(  rCt        | j                        du r*t        | |      }|t        d      }|j                  |_
        |S | j                   t        v rt        | |      }|S | j                   dk(  rd|v rt        | ||      }|S | j                   dk(  rd|v rt        |       }|S t        | ||      }|S )	z?Process text element and determine how to deal with its contentNr?   r+   rH   r   Tr   r   )r-   r]   CODES_QUOTESr   rF   r   r   rT   r   r
   r.   
FORMATTINGrO   r   r   r   )r8   r   r9   new_elementthis_elements        r5   r   r     si   K{{f"7G4, + 
	$#GW5( ' 
	#GW5$ # 
	'I   
	7<<(D0'9L'%cl#/#4#4   

	"'9  
	G~$="7NGD  
		!i>&A"7+  ,G^WMr7   treeresult_bodyc                 L   t         j                  d       d}j                  dk(  rj                  ddg       |dz  }t	        |       }dvrt        |ddd	       nt        |d	       |j                  |      }|j                  t        d
 fd|D                     |S )zLook for all previously unconsidered wild elements, including outside of the determined
       frame and throughout the document to recover potentially missing text partszRecovering wild text elementsz\.//blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, 'w3-code')]r   r   r   z|.//div|.//lb|.//listr%   ar(   c                 
    | d uS N xs    r5   <lambda>z#recover_wild_text.<locals>.<lambda>  s
     r7   c              3   8   K   | ]  }t        |        y wr   r   r   er9   r   s     r5   r   z$recover_wild_text.<locals>.<genexpr>  s$      8* 9H>[b8c 8*   )	r3   r4   r   updateprune_unwanted_sectionsr   xpathextendfilter)r   r   r9   r   search_exprsearch_treesubelemss     ``   r5   recover_wild_textr      s     LL01rK}} udm,..)$HKN";UF3;'  -Hv5 8* (8* + ,r7   c                    |j                   dk(  }t        | t        d      } d|vrt        | t              } |j                   dk7  r"t        | t              } |rt        | t
              } t        d      D ]/  }t        | dd|      } t        | d	d
|      } t        | dd
|      } 1 d|v s|r1| j                  d      D ]  }t        |      du st        |d
        |rot        |       dkD  rC| d   j                  dk(  r1t        | d   d
       t        |       dkD  r| d   j                  dk(  r1t        | dd
d      } t        | dd
d      } | S )z1Rule-based deletion of targeted document sections	precisionT)with_backupr   r      r   )backtrackingfavor_precisionr?   FrH   r   	keep_tailr   rp   r+   r*   )r   r   r!   r    r#   r"   ranger   r|   r   r   r>   r-   )r   r   r9   r   _r_   s         r5   r   r     sP   mm{2O&;ND&#D*@A}} #D*>?'.EFD1X f%dEVef%dFXgh%dCeUdef
 . OIIg& 	6D'-5tu5	6 $i!mb!748u5 $i!mb!7%dFX\]%dG%Y]^Kr7   c                    t        t              j                  du rj                  g d       j                  du rj                  d       j                  du rj                  d       t        d      }t        D ]  }t        d  ||       D        d       }| t        |      }t        |      dk(  r<|j                  d      }j                  d	k(  rd
}nd}|r*t        dj                  |            j                  |z  k  rj                  d       dvrt!        |d       dvrt!        |d       t"        j%                  t'                     |j                  d      }|D ch c]  }|j(                   c}dhk(  r|g}|j+                  fd|D        D 	cg c]  }	|	|		 c}	       t        |      dkD  rI|d   j(                  t,        v r4t/        |d   d       t        |      dkD  r|d   j(                  t,        v r4t        |      d
kD  st"        j%                  t1        t3        |                    n dj                  |j5                               j7                         }
||
fS c c}w c c}	w )NT)r   r&   r'   r   r   r%   bodyc              3   &   K   | ]	  }||  y wr   r   r   ss     r5   r   z_extract.<locals>.<genexpr>D       ?a?   r   z//p//text()r   r      r1   r   r(   .//*r   c              3   8   K   | ]  }t        |        y wr   r   r   s     r5   r   z_extract.<locals>.<genexpr>`  s     )hZ[/!^W*U)hr   rp   Fr   ro   )rV   r   tablesr   imagesaddlinksr
   r   nextr   r>   r   r   rA   min_extracted_sizer   r3   r4   sortedr-   r   NOT_AT_THE_ENDr   r   r   rB   rq   )r   r9   r   exprsubtreeptestfactorr   r   rr   	temp_textr   s    `         @r5   _extractr  7  s:   %N~~9:~~9%}}5!&/K %?4:?F?)'>7Kw<1m,==K'FFBGGEN+g.H.H6.QQu%&w&'w'VN+,==(#$aAEE$.yH)h_g)h{2lnlzB{|+"B(;(;~(M;r?e< +"B(;(;~(M {aLLc$i)K%L --/0668I	>11 % |s   3I?)J
1J
cleaned_treec                 F   t        |       }t        | |      \  }}}t        |      dk(  st        |      |j                  k  r;t	        ||||      }dj                  |j                               j                         }t        |d       t        |d       ||t        |      fS )zFind the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert themr   ro   r=   r   )
r   r  r>   r   r   rA   rB   rq   r   r   )r  r9   backup_treer   r  r   s         r5   extract_contentr  l  s    
 <(K-5lG-L*KN ;1I1K1K K'['>ZHH[1134::<	;'{E"	3y>11r7   c                 x    | j                   |v r,t        | |d      }||j                  j                          |S y)z?Process comment node and determine how to deal with its contentTr;   N)r-   r   rU   r   )r_   r   r9   rN   s       r5   process_comments_noder    s@    xx>!+D'M($$**, %$r7   c           
      
   t        d      }t        t              t        D ]  }t	        d  ||       D        d      }|t        |t              }t        |ddd       |j                  t        d fd|j                  d	      D                     t        |      d
kD  st        j                  |       t        |d        n dj                  |j!                               j#                         }||t        |      | fS )z>Try to extract comments out of potential sections in the HTML.r   c              3   &   K   | ]	  }||  y wr   r   r   s     r5   r   z#extract_comments.<locals>.<genexpr>  r   r   Nr   r%   r(   c                 
    | d uS r   r   r   s    r5   r   z"extract_comments.<locals>.<lambda>  s
    atm r7   c              3   8   K   | ]  }t        |        y wr   )r  r   s     r5   r   z#extract_comments.<locals>.<genexpr>  s%       >Ptu>STUWegn>o  >Pr   r   r   Fr   ro   )r
   rV   r   r   r   r   r   r   r   r   r   r>   r3   r4   r   rA   rB   rq   )r   r9   comments_bodyr   r   temp_commentsr   s    `    @r5   extract_commentsr    s   FOM%N ?4:?F?&w0FG7C/ 	V$;  >P  zA  zG  zG  HN  zO  >P  Q  	R}!LL7e4-0 HH]3356<<>M-]);TAAr7   )Q__doc__loggingr   copyr   typingr   r   r   r   r   
lxml.etreer	   r
   r   r   r   r   	lxml.htmlr   htmlprocessingr   r   r   r   r   settingsr   r   utilsr   r   r   r   xmlr   xpathsr   r   r   r    r!   r"   r#   	getLogger__name__r3   r   r   r   r   r   r   r   bytesr6   rF   rO   rY   r^   re   boolrg   rl   r]   rz   r}   r   r   r   r   r   r   r   r   r   r  r   r  r  r  r   r7   r5   <module>r     s    	  3 3 Z Z !3 3 - M M D D D
 
		8	$ e}Tl	"
 %DC Dc D%s
2C)D D D
8 i HX<N 4/x /) /@R /d7H 7x 7U] 7bf 78 X PY ^b  ( (H ( (R( Rt R
R8 R RT R&( &Y &8H;M &R8    X 8 i HX<N $8 SX PY ^fgo^p 6Ox OS OI OZbckZl Od  OX Os3x O) OX`aiXj Od"(8, "(1C "JX s3x ) X`aiXj < kv K h  dg   {C *+ s3x R[ `k D22; 22 22uXsCPSH=T7U 22j2+ 2	 2eHVY[^L^F_ 2. #c( Y [cdl[m B; B BuXsTWYdEd?e Br7   