
    (#h[                     d   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZmZmZmZmZmZmZ d
dlmZmZ d
dlmZmZmZ  ej@                  e!      Z" ed      Z# e$ ee%      jL                  dz  dz        Z'h dZ(h dZ)da*ddhZ+h dZ, ed      Z-h dZ.h dZ/h dZ0h dZ1g dZ2ddddd Z3d!Z4dNd"ed#e5d$dfd%Z6dOd"ed&e5d$dfd'Z7d(ed$efd)Z8d(ed$efd*Z9dNd+ed,e5d$e$fd-Z:d(ed$efd.Z;d+ed$efd/Z<d0ed1ed$e$fd2Z=d3ed+ed$dfd4Z>d+ed$efd5Z?d6ed7ee$   d$efd8Z@d6ed$e5fd9ZAd"ed&e5d$e$fd:ZBd"ed;ee$   d&e5d$dfd<ZCd=ee   d&e5d$e$fd>ZDd?d@dAd0ed&e5dBe$d@e$d$e$f
dCZEd+ed$efdDZFd+ed$e$fdEZGdFed+ed$efdGZHd"ed$dfdHZId"ed$dfdIZJd"ed$efdJZKdKed$dfdLZLd"ed$dfdMZMy)PzE
All functions related to XML generation, processing and validation.
    N)unescape)version)StringIO)dumps)Path)ListOptional)_ElementElement
SubElement	XMLParser
fromstringtostringDTD   )Document	Extractor)sanitizesanitize_treetext_chars_testtrafilaturadataztei_corpus.dtd>   pabhilbdeldivrefrowbodycellcodeheaditemlistquotetablegraphic>   rendroletypetarget	renditionr   r   >   r   r   r&   r'   r(   T)remove_blank_text>	   r   r   r    r#   r$   r&   r'   r(   r)   >   r   r   r   r$   >	   r   r   r   r    r"   r$   r%   r&   r)   >   r"   r%   noter'   figure)sitenametitleauthordateurlhostnamedescription
categoriestagslicenseidfingerprintlanguagez***__`)z#bz#iz#uz#ti  element	keep_tailreturnc                    | j                         }|y|rc| j                  rW| j                         }|#|j                  xs d| j                  z   |_        n"|j                  xs d| j                  z   |_        |j	                  |        y)z
    Removes this element from the tree, including its children and
    text. The tail text is joined to the previous element or parent.
    N )	getparenttailgetprevioustextremove)rB   rC   parentpreviouss       L/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/xml.pydelete_elementrO   6   su    
  F~W\\&&(!;;,"<FK%]]0bGLL@HM
MM'    include_formattingc                 \   | j                         }|yt        | |      }| j                  || j                  z  }| j                         }|%|j                  r|j                   d| n||_        n*|j                  |j                   d| |_        n||_        |j                  |        y)zAMerge element with its parent and convert formatting to markdown.N )rG   replace_element_textrH   rI   rJ   rK   )rB   rQ   rL   	full_textrM   s        rN   merge_with_parentrV   I   s     F~$W.@AI||W\\!	""$H:B--8==/9+6Y		 Qyk2
MM'rP   treec                 8   | j                  d      D ]  }t        |      dk(  st        |j                        du s*t        |j                        du sB|j                         }|U|j                  dk7  se|j                  dk7  su|j                  |        | S )z"Remove text elements without text.r?   r   Fr)   r#   )iterlenr   rJ   rH   rG   tagrK   )rW   rB   rL   s      rN   remove_empty_elementsr\   ^   s    99S> 'w<1!>%!GO\c\h\hLimrLr&&(F !gkkY&>6::QWCWg&' KrP   c                     t        | j                  d            D ]`  }|j                  ddd      D ]H  }|j                  |j                  k(  s|j	                         j                  t
        vs>t        |       J b | S )z/Prevent nested tags among a fixed list of tags.z.//head | .//code | .//pr#   r$   r   )reversedxpathiterdescendantsr[   rG   NESTING_WHITELISTrV   )rW   elemsubelems      rN   strip_double_tagsrd   j   st    $>?@ +++FFC@ 	+G{{dhh&7+<+<+>+B+BJ[+[!'*	++ KrP   docmetawith_metadatac                 H   |r| j                   D ci c]  }|t        | |d       }}|j                  |j                  d      |j                  d      |j                  d      dj	                  |j                  d      xs g       dj	                  |j                  d      xs g       t        |j                  d      d	
      d       |j                  d      }n%dt        | j                  d	
      i}| j                  }t        |d	
      |d<   t        |d	      S c c}w )z0Build JSON output based on extracted informationNr6   r2   r8   ;r9   r:   r!   F)rQ   )sourcezsource-hostnameexcerptr9   r:   rJ   commentsbodyrJ   comments)ensure_ascii)		__slots__getattrupdatepopjoinxmltotxtr!   rk   
json_dumps)re   rf   slot
outputdictrk   s        rN   build_json_outputrw   s   s    ELEVEVWTdGGT488W
W nnU+)~~j9!~~m4((:>>,#?#E2FHHZ^^F39r:Z^^F3N
 	 "~~n5hw||NO
++%luMJzju55! Xs   Dc                     | j                  d      D ]/  }|j                  t        vs|j                  j	                          1 | S )zRemove unnecessary attributes.r?   )rY   r[   WITH_ATTRIBUTESattribclear)rW   rb   s     rN   clean_attributesr|      s<    		#  88?*KK  KrP   c                    t        d      }t        ||        d| j                  _        |j	                  t        | j                               d| j                  _        |j	                  t        | j                               |S )z4Build XML output tree based on extracted informationdocmainrl   )r   add_xml_metar!   r[   appendr|   rk   re   outputs     rN   build_xml_outputr      sd    U^F!GLL MM"7<<01)G
MM"7#7#789MrP   documentoptionsc                    t        | j                         t        | j                         |j                  dk(  rt        nt
        } ||       }t        |      }t        t        |d      t              }|j                  dk(  r6|j                  r*t        j                  dt        |      |j                         t        |dd      j                         S )z9Make sure the XML output is conform and valid if requiredxmlunicode)encodingxmlteizTEI validation result: %s %sT)pretty_printr   )rd   r!   r\   formatr   build_tei_outputr   r   r   CONTROL_PARSERtei_validationLOGGERdebugvalidate_teiri   strip)r   r   funcoutput_trees       rN   control_xml_outputr      s    hmm$(--(&~~6<LDx.K,KXkIFWK ~~!g&<&<3\+5NPWP^P^_KdYGMMOOrP   r   c                     t         D ]E  }t        ||d      }|s| j                  |t        |t              r|ndj                  |             G y)z-Add extracted metadata to the XML output treeNrh   )META_ATTRIBUTESro   set
isinstancestrrr   )r   re   	attributevalues       rN   r   r      sF    $ X	D1JJy:eS+A%sxxPUWXrP   c                 H    t        |       }t        || j                        }|S )z8Build TEI-XML output tree based on extracted information)write_teitree	check_teir6   r   s     rN   r   r      s%     7#F vw{{+FMrP   xmldocr6   c                    | j                  d      D ]v  }d|_        |j                  dd       |j                         }|/t	        |      dkD  rt        |      }|j                  ||       |}|j                  dk(  slt        |       x | j                  d      D ]J  }|j                  s|j                  j                         s+d|j                  dc|_        |_        |_	        L | j                  d	      D ]  }|j                  t        vr-t        j                  d
|j                  |       t        |       B|j                  t         v rt#        |       n%|j                  dk(  rt%        |       t'        |       |j(                  D cg c]  }|t*        vs| c}D ]?  }t        j                  d||j                  |       |j(                  j-                  |       A  | S c c}w )zCCheck if the resulting XML file is conform and scrub remaining tagsr$   r   r,   headerNr   r   z.//text/body//div/lbz.//text/body//*z"not a TEI element, removing: %s %sr   z0not a valid TEI attribute, removing: %s in %s %s)rY   r[   r   rG   rZ   _tei_handle_complex_headreplace_move_element_one_level_upfindallrH   r   rJ   TEI_VALID_TAGSr   warningrV   TEI_REMOVE_TAIL_handle_unwanted_tails!_handle_text_content_of_div_nodes_wrap_unwanted_siblings_of_divrz   TEI_VALID_ATTRSrq   )r   r6   rb   rL   new_elemar   s          rN   r   r      s    F# -"!>t9q=/5HNN4*D::&t,- 56 B99*-0$))T*DHdiB 01 '88>) NN?3Od#88&"4(XX-d3*40 &*[[MA_4L!M 	'INNMyZ^ZbZbdghKKOOI&	'!'& M Ns   G(G(c                     t         t        t              a t         j                  |       }|du r.t        j                  dt         j                  j                         |S )zUCheck if an XML document is conform to the guidelines of the Text Encoding InitiativeFznot a valid TEI document: %s)TEI_DTDr   
TEI_SCHEMAvalidater   r   	error_log
last_error)r   results     rN   r   r      sI     j/f%F5w7H7H7S7STMrP   c                    | j                   xs d}|r| j                   r| j                  dk(  r)	 t        | j                  d      d         }d|z   d| }n| j                  dk(  rd	| d	}ni| j                  d
k(  r0| j                  d      }|t        v rAt        |    | t        |    }n*| j                  dk(  rd| j                   v rd| d}nd| d}| j                  dk(  ri|rFd| d}| j                  d      }|r	| d| d}nEt        j                  d|| j                         |}n!t        j                  d|| j                         | j                  dk(  r?|r=t        |       dkD  r/| d   j                  dk(  r| j                         | dnd| d}|S | j                  dk(  r|r| j                         | }|S d| }|S | j                  dk(  r|rd| d}|S # t        t
        f$ r d}Y w xY w)zeDetermine element text based on just the text of the element. One must deal with the tail separately.rF   r$   r*   r      #rS   r   z~~r   r#   
z```
z
```rA   r   []r-   ()zmissing link attribute: %s %s'zempty link: %s %sr"   r   r   z| r%   z- )rJ   r[   intget	TypeError
ValueErrorHI_FORMATTINGr   r   rz   rZ   rI   )rB   rQ   	elem_textnumberr*   	link_textr-   s          rN   rT   rT      s5   "Igll;;& W[[034 <.)5I[[E!YKr*I[[D ;;v&D}$,T23I;}T?R>ST	[[F"w||##I;e4		{!,	{{eI;a(I[[*F(k6(!4	?GNN[%	NN.	7>>J{{fs7|a/?1:>>S +2+>+>+@+L9+QTVW`VaabRcI  
	9&-&9&9&;&Gyk	 	 PRR[Q\M]	  
	92&	I z* s   G" "G76G7
returnlistc           	         | j                   r|j                  t        | |             | D ]  }t        |||        | j                   sr| j                  se| j
                  dk(  r^| j                  dd       d| j                  dd       }|j                  d|j                          d| j                  dd       d	       n| j
                  t        v r| j
                  d
k(  rt        | j                  d            }| j                  d      xs | j                  d      }|r|j                         sd}nt        t        |      t              }||k  r|j                  d||z
  z   d       | j                  d      r;|j                  dd|z   d       n"|j                  d       n| j
                  dk7  ry| j
                  t        v r6| j                  d      s%|j                  |r| j
                  d
k7  rdnd       nD| j
                  dk(  r|j                  d       n#| j
                  t        vr|j                  d       | j                  r|j                  | j                         yy)zYRecursively convert a LXML element and its children to a flattened string representation.r)   r3   rF   rS   altz![z](srcr   r    z.//cellcolspanspanr   |r   z./cell[@role='head']z
|z---|r"   Nzancestor::cellu   
␤
z | )rJ   r   rT   process_elementrH   r[   r   r   NEWLINE_ELEMSrZ   r_   isdigitminr   MAX_TABLE_WIDTHSPECIAL_FORMATTING)rB   r   rQ   childrJ   
cell_count	span_infomax_spans           rN   r   r   ,  s   ||.w8JKL ?z+=>? <<;;)#kk'2./qUB1G0HID4::<.7;;ub3I2J!LM[[M){{e# y!9:
#KK	2Igkk&6I	 	(9(9(; H"3y>?CH(%%:0E)F(Gr&JK==!78%%FX,=+>b&AB!!$'[[F" 
 {{m#GMM:J,K*<PUAU,[_`		% 	.	.# ||',,' rP   	xmloutputc                 t    | yg }t        | ||       t        t        dj                  |            xs d      S )zLConvert to plain text format and optionally preserve formatting as markdown.rF   )r   r   r   rr   )r   rQ   r   s      rN   rs   rs   b  s;    JIz+=>HRWWZ017R88rP   	null)delimr   r   c                   t        | j                  |      xs |}t        | j                  |      xs |}t               }t	        j
                  ||t        j                        }|j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  ||| j                  | j                   fD cg c]  }|r|n|
 c}       |j#                         S c c}w )zAConvert the internal XML document representation to a CSV string.)	delimiterquoting)rs   r!   rk   r   csvwriterQUOTE_MINIMALwriterowr6   r<   r=   r7   r3   imager5   r;   pagetypegetvalue)	r   rQ   r   r   posttextcommentstextr   outputwriterds	            rN   xmltocsvr   n  s     '9:BdHH113EFN$L ZF::fs?P?PQL 6 1t+   ??s   C=c                 d   t        dd      }t        ||        t        |d      }t        |d      }t        | j                        }d|_        |j                  dd       |j                  |       t        | j                        }d|_        |j                  dd	       |j                  |       |S )
z6Bundle the extracted post and comments into a TEI treeTEIzhttp://www.tei-c.org/ns/1.0)xmlnsrJ   r!   r   r,   entryrl   )	r   write_fullheaderr   r|   r!   r[   r   r   rk   )re   teidoctextelemtextbodypostbodyrk   s         rN   r   r     s    U"?@FVW%&&)H(F+H-HHLLL!OOH#G$8$89LLVZ(OOL!MrP   c                 Z   | j                   r8| j                  r,| j                  j                          d| j                    d}|S | j                   xs | j                  xs d}t        j	                  t
        j                        r%|dk(  r t        j                  d| j                         |S )z5Construct a publisher string to include in TEI headerz (r   zN/Azno publisher for URL %s)	r7   r2   r   r   isEnabledForloggingWARNINGr   r6   )re   	publishers     rN   _define_publisher_stringr     s    G,,''--/073C3C2DAF	
  $$A(8(8AE	w/I4FNN4gkkBrP   r   c                    t        | d      }t        |d      }t        |d      }|j                  t        |dd      _        |j                  r|j                  t        |d      _        t        |d      }t	        |      }|j
                  r9|t        |d	      _        t        |d
      }|j
                  t        |d      _        nt        |d       t        |d      }|j                  r|j                  t        |dd      _        |j                  t        |dd      _        t        |d      }	t        |	d      }
dj                  t        d|j                  |j                  g            }|s t        j                  d|j                         dj                  t        d|j                  |g            |
_        |t        |	dd      _        t        |	d      }t        |d      }|j                  t        |dd      _        |j                  r|j                  t        |d      _        t        |d      }|t        |d	      _        |j                  rt        |dd|j                         |j                  t        |d      _        t        |d      }t        |d      }|j                  t        |d      _        |j                   s|j"                  rt        |d      }t        |d      }|j                   r,dj                  |j                         t        |d d!      _        |j"                  r,dj                  |j"                        t        |d d"      _        t        |d#      }|j$                  t        |dd$      _        t        |d%      }t        |d&      }t        |d't&        d()      }d(t        |d*      _        t        |dd+,       |S )-z+Write TEI header based on gathered metadata	teiHeaderfileDesc	titleStmtr3   r   )r,   r4   publicationStmtr   availabilityr   	notesStmtr0   r<   r=   
sourceDescbiblz, Nzno sigle for URL %ssiglebiblFullptrURL)r,   r-   r5   profileDescabstract	textClasskeywords,termr9   r:   creationdownloadencodingDescappInfoapplicationTrafilatura)r   identlabelz$https://github.com/adbar/trafilatura)r-   )r   r3   rJ   r4   r   r;   r<   r=   rr   filterr2   r5   r   r   r6   r8   r9   r:   filedatePKG_VERSION)r   re   r   filedescbib_titlestmtpublicationstmt_apublisher_stringr  	notesstmt
sourcedescsource_biblr  biblfullpublicationstmtprofiledescr  	textclassr  r  encodingdescappinfor  s                         rN   r   r     s    ,F&*-Hx5M;B==J}gF38~~3:>>
=(+0"8->?/8:J
$k27!"3^D-4__
<%* 	$c*8[1Izz8?


9f405=D=P=PJy&}5:Hl3JZ0KIIfTG$4$4gll#CDEE,gkk:yyw}}e.D!EFK8=Jz605*j1Hx5M;B==J}gF38~~3:>>
=(+0 +<=O4DJ,1{{?EgkkJ/6||J',V]3K+z2H%,%8%8Jx"W\\{K8	i4CF88GL^L^C_Jxl;@<<=@XXgll=SJxf5:+z2H9@9I9IJxj16fn5Ly1GWm[P]^K,9J{G$){E*PQMrP   c                    | j                   r| j                   j                         rt        |       dkD  rK| d   j                  dk(  r9| j                    d| d   j                   xs d j                         | d   _         n.t	        d      }| j                   |_         | j                  d|       d| _         | j                  r| j                  j                         rt        |       dkD  rK| d   j                  dk(  r9| d   j                   xs d d| j                   j                         | d   _         n-t	        d      }| j                  |_         | j                  |       d| _        yyy)z@Wrap loose text in <div> within <p> elements for TEI conformity.r   r   rS   rF   N)rJ   r   rZ   r[   r   insertrH   r   )rB   	new_childs     rN   r   r     s    ||**,w<!
# 5!(a
0E2/FGMMOGAJOI$\\INNN1i(||**,w<!3 6")"+"2"2"8b!97<<.IOOQGBKI$\\INNN9% -|rP   c                    | j                   r| j                   j                         nd| _         | j                   sy| j                  dk(  r>dj                  t	        d| j
                  | j                   g            | _        d| _         yt        d      }| j                   |_        | j                         }|$|j                  |j                  |       dz   |       d| _         y)z Handle tail on p and ab elementsNr   rS   r   )
rH   r   r[   rr   r  rJ   r   rG   r*  index)rB   new_siblingrL   s      rN   r   r     s    +2<<7<<%%'TGL<<{{cxxtgllGLL-I JK GL cl"<<""$MM&,,w/!3kBGLrP   c                 .   t        d| j                        }| j                  r| j                  j                         nd|_        | j	                         D ]  }|j
                  dk(  rjt        |      dkD  s|j                  r>t        |      dk(  s|d   j                  rt        |d       |j                  |d   _        j|j                  |_        ||j                  |        | j                  r| j                  j                         nd}|r||_        |S )z0Convert certain child elements to <ab> and <lb>.r   )rz   Nr   r   r)  r   )
r   rz   rJ   r   iterchildrenr[   rZ   rH   r   r   )rB   new_elementr   rH   s       rN   r   r     s    $w~~6K/6||w||))+K%%' 
&99;!#{'7'7{#q(KO,@,@{D1',zzB$#(:: u%
& $+<<7<<TDrP   div_elementc                    t        d      }d}| j                         }|y| j                         D ]}  }|j                  dk(  r nl|j                  t        v r'|xs |j                  |      }|j                  |       M|sPt        |      dkD  s_|j                  ||       t        d      }d} |r"t        |      dk7  r|j                  ||       yyy)z=Wrap unwanted siblings of a div element in a new div element.r   Nr   )	r   rG   itersiblingsr[   TEI_DIV_SIBLINGSr-  r   rZ   r*  )r2  r.  new_sibling_indexrL   siblings        rN   r   r   )  s    %.K""$F~++- );;%;;** 1 JV\\'5Jw' !S%5%9/=%en$(!) S-2'5 3rP   c                    | j                         }||j                         nd}||yt        d      }|j                  t        | j	                                      |j                  |j                  |      dz   |        | j                  r| j                  j                         nd}|r||_	        d| _        |j                  r|j                  j                         nd}|r||_        d|_        t        |      dkD  s|j                  s|j                  r$|j                  |j                  |       dz   |       t        |      dk(  r|j                  s|j                  |       yyy)z
    Fix TEI compatibility issues by moving certain p-elems up in the XML tree.
    There is always a n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p
    Nr   r   r   )rG   r   extendr&   r4  r*  r-  rH   r   rJ   rZ   rK   )rB   rL   grand_parentr   rH   s        rN   r   r   B  s-   
  F)/);6##%L~-s|HOOD--/01**62Q6@#*<<7<<TD"(++6;;4D
8}qHMMX]]L..w7!;XF
6{aF# !,rP   )T)F)N__doc__r   r   htmlr   importlib.metadatar   ior   jsonr   rt   pathlibr   typingr   r	   
lxml.etreer
   r   r   r   r   r   r   settingsr   r   utilsr   r   r   	getLogger__name__r   r  r   __file__rL   r   r   r   r   r   r5  r   r   r   ry   ra   r   r   r   boolrO   rV   r\   rd   rw   r|   r   r   r   r   r   r   rT   r   rs   r   r   r   r   r   r   r   r   r    rP   rN   <module>rJ     s/      &  $  !3 3 3 * ; ; 
		8	$m$ h&&/2BBC
MA
+8 T2W1 X?  3d#>H   &x T d *	 	X 	H  6x 6 6 6,8  h 8 P PI P# P&X XH X Xh 8 'h 'Xc] 'x 'T d ,( , , ,^3(X 3(49 3(RV 3([_ 3(l	9* 	9 	9 	9 LP]c x T S WZ hk 68  &h 3 DX D DX DNx D *H  "h 8 *6 6T 62$ $T $rP   