
    (#h                         d Z ddlZddlmZmZ ddlmZmZmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ d
e
de
fdZdedeeeef   fdZddededefdZy)z<
Module regrouping baseline and basic extraction functions.
    N)AnyTuple)_ElementElement
SubElement)HtmlElement   )BASIC_CLEAN_XPATH)	load_htmltrim)delete_elementtreereturnc                 <    t        |       D ]  }t        |        | S )z-Remove a few section types from the document.)r
   r   )r   elems     Q/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/baseline.pybasic_cleaningr      s#    !$' tK    filecontentc                    t        |       }t        d      }||ddfS d}|j                  d      D ]  }|j                  sd|j                  v s	 t	        j
                  |j                        j                  dd      }|sRd|v r)t        |      }|t        |j                               nd}nt        |      }|t        |d      _        ||rd|z   n|z  } t        |      d	kD  r||t        |      fS t        |      }d}|j                  d
      D ]G  }t        |j                               }t        |      d	kD  s+|t        |d      _        ||rd|z   n|z  }I t        |      dkD  r||t        |      fS t               }	d}|j                  dddddd      D ]N  }
t        |
j                               }||	vs!|t        |d      _        ||rd|z   n|z  }|	j                  |       P t        |      d	kD  r||t        |      fS t        d      }|j!                  d      }|{t        |d      }|j#                         D cg c]  }t        |       }}dj%                  |D cg c]  }|s|	 c}      |_        ||j                  t        |j                        fS t'        |d      }|t        |d      _        ||t        |      fS # t        $ r d}Y \w xY wc c}w c c}w )a)  Use baseline extraction function targeting text paragraphs and/or JSON metadata.

    Args:
        filecontent: HTML code as binary string or string.

    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.

    body r   z&.//script[@type="application/ld+json"]articleBodyz<p>p d   z
.//article
blockquotecodepreqquote.//body
F)clean)r   r   iterfindtextjsonloadsget	Exceptionr   text_contentr   lenr   setiteraddfinditertextjoinhtml2txt)r   r   postbody	temp_textr   	json_bodyparsedr&   article_elemresultselemententry	body_elemp_eleme
text_elemss                   r   baseliner@      s    [!DvH|Q IFG ?99$))3 JJtyy155mRH	 I%&y1F:@:L4 3 3 56RTD	?D15
8S).9S4Z$>	? 9~C	N22$D Il3 ;L--/0t9s?-1Jx%*ytd:I	;
 8}qC	N22 eGI99\63sGL W))+,-2Jx%*	uu<IKK 9~C	N22 vH		)$IHc*'0'9'9';<!d1g<
<iiJ <q! <=c&++&666 D&D%)Jx"T3t9$$m  	^ = <s$   /KK (K%0K%KKcontentr$   c                     t        |       }|y|j                  d      }|y|rt        |      }dj                  |j	                         j                               j                         S )zRun basic html2txt on a document.

    Args:
        content: HTML document as string or LXML element.
        clean: remove potentially undesirable elements.

    Returns:
        The extracted text in the form of a string or an empty string.

    r   r"   r   )r   r0   r   r2   r+   splitstrip)rA   r$   r   r   s       r   r3   r3   h   sc     WD|99YD|d#88D%%'--/06688r   )T)__doc__r'   typingr   r   
lxml.etreer   r   r   	lxml.htmlr   settingsr
   utilsr   r   xmlr   r   strintr@   boolr3    r   r   <module>rP      st   
   4 4 ! ' "   L%# L%%#s(:"; L%^9c 9$ 9# 9r   