
    (#hH              8          d Z ddlZddlZddlmZmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4  ejj                  e6      Z7ddhZ8de"de&de9fdZ:dededede&de
ee9e;f   f
dZ<ddddddddddddddddddddde%dfded ee9   d!e=d"e=d#e=d$e=d%e=d&e9d'ee9   d(e=d)e=d*e=d+e=d,e=d-eee9ef      d.e=d/e=d0ee;   d1ee	e9      d2ee	e9      d3e=d4ee   d5edee&   deee"ee9ef   f      f2d6Z>ddddddddddddddddddddddde%dfded ee9   d7ee9   d!e=d"e=d#e=d$e=d%e=d&e9d8e=d'ee9   d(e=d)e=d*e=d+e=d,e=d-eee9ef      d.e=d/e=d0ee;   d1ee	e9      d2ee	e9      d9ee9   d4ee   d5edee&   dee9   f6d:Z?y);z4
Extraction configuration and processing functions.
    N)copydeepcopy)AnyDictOptionalSetTupleUnion)_ElementElementXPath
strip_tags)HtmlElement   )baseline)content_fingerprintduplicate_test)compare_extraction)build_html_outputconvert_tagsprune_unwanted_nodestree_cleaning)extract_commentsextract_content)Documentextract_metadata)DEFAULT_CONFIG	Extractor
use_config)LANGID_FLAGcheck_html_langlanguage_filter	load_htmlnormalize_unicode)build_json_outputcontrol_xml_outputxmltotxtxmltocsv)REMOVE_COMMENTS_XPATHmarkdowntxtdocumentoptionsreturnc           
         d|j                   v r| j                  j                  d      D ]o  }|j                  dk7  st	        |      dk(  s"|j
                  r/|j                  r<|j                         }|O|j                  dk7  s_|j                  |       q t        | |      }t)        |      S |j                   dk(  r!t        | |j                        }t)        |      S |j                   dk(  r!t        | |j                        }t)        |      S |j                   dk(  r!t        | |j                        }t)        |      S |j                  r:d	}d
D ]-  }t        | |      s|| dt!        t        | |             dz  }/ |d	z  }nd}| t#        | j                  |j                         }| j$                  3| dt#        | j$                  |j                         j'                         }t)        |      S )zMConvert XML tree to chosen format, clean the result and output it as a stringxml*graphicr   codecsvjsonhtmlz---
)titleauthorurlhostnamedescriptionsitenamedate
categoriestagsfingerprintidlicensez: 
 )formatbodyitertaglentexttail	getparentremover&   r(   
formattingr%   with_metadatar   getattrstrr'   commentsbodystripr$   )r,   r-   elementparentreturnstringheaderattrs          M/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/core.pydetermine_returnstringrZ   ,   s    }}))#. 
	+Gy(LA% **,%&***>MM'*
	+ *(G<J \**G 
5	 '*<*<=D \**A 
6	!(73H3HI> \**; 
6	!(73H3HI8 \**3   F J 8T*bWXt-D)E(FbIIFJ  gFF (8==':L:L"M!NO  ,*^2hx7L7LgN`N`.a-bciikL\**    cleaned_treecleaned_tree_backuptree_backupc                    t        | |      \  }}}|j                  st        |t        |      ||||      \  }}}||j                  k  r=|j
                  dk(  s.t        t        |            \  }}}t        j                  d|       |||fS )z?Execute the standard cascade of extractors used by Trafilatura.	precisionz+non-clean extracted length: %s (extraction))	r   fastr   r   min_extracted_sizefocusr   LOGGERdebug)r\   r]   r^   r-   postbody	temp_textlen_texts          rY   trafilatura_sequenceri   e   s     %4L'$J!Hi <<(:[!)
%)X ',,,W]]k5Q(0+1F(G%)XBHMY((r[   FTpythonfilecontentr9   ra   no_fallbackfavor_precisionfavor_recallinclude_commentsoutput_formattarget_languageinclude_tablesinclude_imagesinclude_formattinginclude_linksdeduplicatedate_extraction_paramsrO   only_with_metadatamax_tree_sizeurl_blacklistauthor_blacklistas_dictprune_xpathconfigc           	      
   |r|}t        j                  dt               |rt        j                  dt               |rt        d      |rt	        |t
              s?t        d)i d|d|d|d|d|d	|d
|d|d|
d|	d|d|d|d|d|d|d|d|}	 t        |       }|t        j                  d|       t        |j                  rP|j                  st        s>t        ||j                        du r&t        j                  d|j                         t        |j                  rt        ||j                   |j"                  |j                  |j$                        }|j                   |j&                  v r&t        j)                  d|j                          t        |j*                  rT|j,                  r|j.                  r|j                   s0t        j                  d|j                         t        t1               }|6t	        |t2              r|g}t5        ||D cg c]  }t7        |       c}      }t9        t;        |      |      }t;        |      }t=        |||j                   xs |j                         }|j>                  rtA        ||      \  }}}}ntC        d      dd}}}|jD                  dk(  rt5        |tF              }tI        ||||      \  } }!}"|jJ                  rtM        |       |jJ                  kD  r+t        jO                  dtM        |              tQ        | d        tM        |       |jJ                  kD  r0t        jO                  d!tM        |       |j                         t        |j>                  r/||jR                  k  r t        jO                  d"|j                         |"|jT                  k  r7||jV                  k  r(t        jO                  d#|"||j                         t        |jX                  r4t[        | |      d$u r&t        jO                  d%|j                         t        |j                  rEt]        |!||j                  |      \  }#}|#d$u r&t        jO                  d&|j                         t        |j`                  d(k(  r[tc        | |jd                        |_3        |j>                  r"tc        ||jd                        |_        ||_4        |jf                  |_5        n|!|c|_5        |_4        | |_6        |s|S |jo                         S c c}w # t^        t        f$ r# t        j)                  d'|j                         Y yw xY w)*al  Internal function for text extraction returning bare Python variables.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        fast: Use faster heuristics and skip backup extraction.
        no_fallback: Will be deprecated, use "fast" instead.
        favor_precision: prefer less text but correct extraction.
        favor_recall: prefer more text even when unsure.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format, Python being the default
            and the interest of this internal function.
            Other values: "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (present in XML format, converted to markdown otherwise).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        with_metadata: Extract metadata fields and add them to the output.
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        as_dict: Will be deprecated, use the .as_dict() method of the document class.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A Python dict() containing all the extracted information or None.

    Raises:
        ValueError: Extraction problem.
    H"no_fallback" will be deprecated in a future version, use "fast" insteadzR"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results:max_tree_size is deprecated, use settings.cfg file insteadr~   rp   ra   r`   recallcommentsrN   linksimagestablesdeduplangr9   rO   rx   r{   rz   date_paramsNzempty HTML tree: %sFzwrong HTML meta language: %szblacklisted URL: %szno metadata: %srF   rD   r   zoutput tree too long: %shiz'output tree too long: %s, discarding %sznot enough comments: %sz+text and comments not long enough: %s %s %sTz!discarding duplicate document: %szwrong language: %szdiscarding data: %srj    )8warningswarnPendingDeprecationWarning
ValueError
isinstancer   r#   rd   errorr   ra   r    r!   sourcerO   r   r9   r   r{   rz   warningrx   r=   r7   r   rQ   r   r   r   r   r   r   r   r   rc   r)   ri   ry   rI   re   r   min_extracted_comm_sizemin_output_sizemin_output_comm_sizer   r   r"   	TypeErrorrE   r'   rN   rJ   rR   raw_textrF   r|   )$rk   r9   ra   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rO   rx   ry   rz   r{   r|   r}   r~   r-   treer,   xr\   r]   rR   temp_commentslen_commentsrf   rg   rh   is_not_target_langs$                                       rY   bare_extractionr      s   D V%	
 `%	
 UVV *Wi8 

'
 
 &	

  
 &
 *
  
 "
 "
 
 !
 
 (
  2
  .!
" (#
$ /%
*p%<LL.4 <<W\\tW\\2e;;W^^L     '##((H ||w4444hllC   ))(..X\\.?    zH "+s+*m'.MAuQx.MND %T$Z9"<0 $L'7;;;V(,,W FVgGCL-| 9@Q-L==K'/>STL(<-tW)
%)X
   8}w4447XG8T*8}w444=MNN
 ! w/N/N NLL2GNNCw...w;;;LL=	  ==^Hg>$FLL<gnnM <<+:=',,,( "T)17>>B   ~~! 7+=+=> (w7I7I JH$0H!$MM3<l080HM"8:(8(8(::c /ND z" ,gnn=s&   FT T
1IT T /UU	record_idtei_validationsettingsfilec           	      f   |r|}t        j                  dt               |rt        d      |rt	        |t
              sLt        di dt        ||      d|d|d|d|d|d	|d
|d|d|d|d|
d|d|d|d|	d|d|d|}t        | |d|      }|rt	        |t              sy|j                  t        vri|j                  dk(  rt        d      ||_        |j                  <t        t        |j                        dz   t        |j                        z         |_        t#        ||      S )a[  Main function exposed by the package:
       Wrapper for text extraction and conversion to chosen output format.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        fast: Use faster heuristics and skip backup extraction.
        no_fallback: Will be deprecated, use "fast" instead.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        with_metadata: Extract metadata fields and add them to the output.
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A string in the desired format or None.

    r   r   r~   rp   ra   r`   r   r   rN   r   r   r   r   r   r9   rO   rx   r   r{   rz   r   F)r-   r|   r}   Nrj   z9'python' format only usable in bare_extraction() function r   )r   r   r   r   r   r   r   r   r   rE   TXT_FORMATSrA   r   r   rQ   r7   r@   rZ   )rk   r9   r   ra   rl   rm   rn   ro   rp   r   rq   rr   rs   rt   ru   rv   rw   rO   rx   ry   rz   r{   r   r}   r~   r-   r,   s                              rY   extractr   i  s   D V%	

 UVV *Wi8 
lF3
'
 
 &	

  
 &
 *
  
 "
 "
 
 !
 
 (
  2
  *!
" .#
$ (%
& /'
. 	H :h9~~[(>>X%K   (#6HNN#c)C0A0A,BB$H 
 "(G44r[   )@__doc__loggingr   r   r   typingr   r   r   r   r	   r
   
lxml.etreer   r   r   r   	lxml.htmlr   r   deduplicationr   r   externalr   htmlprocessingr   r   r   r   main_extractorr   r   metadatar   r   settingsr   r   r   utilsr    r!   r"   r#   r$   r0   r%   r&   r'   r(   xpathsr)   	getLogger__name__rd   r   rQ   rZ   intri   boolr   r   r   r[   rY   <module>r      s5      9 9 ; ; !  > (  > 0 ; ;  K J ) 
		8	$5!6+X 6+	 6+c 6+r))$) ) 	)
 8S#)> !!!%) $7;$#'(,+/!% #'1d;d;	#d; d; 	d;
 d; d; d; d; c]d; d; d; d; d; d; %T#s(^4d;  !d;" #d;$ C=%d;& CH%'d;( s3x()d;* +d;, #-d;. /d;0 i 1d;2 eHd38n,-.3d;R #!! %) $7;$#'(,+/"&!% #'555	#5 }5 	5
 5 5 5 5 5 5 c]5 5 5 5 5  !5" %T#s(^4#5$ %5& '5( C=)5* CH%+5, s3x(-5. 3-/50 #152 354 i 556 c]75r[   