
    (#hJ                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZ d
dlmZ d
dlm Z m!Z!m"Z"m#Z# d
dl$m%Z%m&Z& d
dl'm(Z(m)Z)m*Z*m+Z+ d
dl,m-Z-m.Z.m/Z/m0Z0m1Z1 dgZ2 ejf                  e4      Z5 ejf                  d      jm                  ejn                          ejp                  d      Z9 ejp                  d      Z: ejp                  d      Z; ejp                  d      Z< ejp                  d      Z= ejp                  dej|                        Z?h dZ@h dZAh dZBh dZCh dZDddhZEh dZFd d!hZGd"d#hZHh d$ZId%d&d'd(d(d(d)d*ZJd+d,hZKg d-ZLd.eMd/eMfd0ZNd1eMd2eeM   d/eeM   fd3ZOd4ed5e%d/e%fd6ZPd4ed/e
eMeeM   f   fd7ZQd4ed/e%fd8ZR	 dLd4ed9ee   d:eSd/eeM   fd;ZTd4ed/eeMeeM   eeM   f   fd<ZUd4ed/eeM   fd=ZVd4ed/eeM   fd>ZWdMd4ed?eeM   d/eeM   fd@ZXd4ed/eeM   fdAZYdBeMd4ed/eeM   fdCZZdNdDedEe[d/eeM   fdFZ\d4ed/eeM   fdGZ]	 	 	 	 dOdHeeeMf   d?eeM   dIee	   dJe[d2eeeM      d/e%fdKZ^y)PzH
Module bundling all functions needed to scrape metadata from webpages.
    N)deepcopy)unescape)AnyDictListOptionalSetTupleUnion)extract_domainget_base_urlis_valid_urlnormalize_urlvalidate_url)	find_date)XPath)HtmlElementtostring   )prune_unwanted_nodes)extract_jsonextract_json_parse_errornormalize_authorsnormalize_json)Documentset_date_params)HTML_STRIP_TAGSline_processing	load_htmltrim)AUTHOR_DISCARD_XPATHSAUTHOR_XPATHSCATEGORIES_XPATHSTAGS_XPATHSTITLE_XPATHSr   htmldatez$https?://(?:www\.|w[0-9]+\.)?([^/]+)z("(?:\\"|[^"])*")|\su5   ^(.+)?\s+[–•·—|⁄*⋆~‹«<›»>:-]\s+(.+)$z["\']z=/(by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero)/([1-9]\.[0-9])zT(cc|creative commons) (by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero) ?([1-9]\.[0-9])?>   
dc.creator
dc:creatordcsext.authoratc-metaauthordc.creator.autparsely-authordcterms.creatorsailthru.authordcterms.creator.autshareaholic:article_author_namebylauthorauthorscreator	rbauthorscitation_authorarticle:author>   dc.descriptiondc:descriptiondcterms.abstractdcterms.descriptiontwitter:descriptionsailthru.descriptiondescription>
   dc.publisherdc:publisherdcterms.publishersailthru.publisher	copyright	publisher	rbpubnamecitation_journal_titletwitter:sitearticle:publisher>   parsely-tagsdcterms.subjectshareaholic:keywordstagskeywordscitation_keywords>   dc.titledcterms.titleparsely-titletwitter:titlesailthru.titleshareaholic:titletitlerbtitlefb_titleheadlinecitation_title	rbmainurltwitter:url>   twitter:imagetwitter:image:srcimageog:imageog:image:urlog:image:secure_urlr2   r7   rG   zapplication-name>   
http-equivcharsetpropertyrU   r>   sitenamer^   pagetype)zog:titlezog:descriptionzog:site_namer_   r`   ra   zog:typez	og:authorzog:article:author)z.//head//link[@rel="canonical"]z.//head//basez6.//head//link[@rel="alternate"][@hreflang="x-default"]rL   returnc                     t        t        |             }|syt        j                  d|      } dj	                  t        d| j                  d                  S )z!Remove special characters of tags z, N)r    r   CLEAN_META_TAGSsubjoinfiltersplit)rL   trimmeds     Q/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/metadata.pynormalize_tagsrq      sG    8D>"GsG,D99VD$**T"2344    r3   author_blacklistc                 .   |D ch c]  }|j                          }}| j                  d      D cg c]2  }|j                         j                         |vr|j                         4 }}|r dj                  |      j                  d      S yc c}w c c}w )z:Check if the authors string correspond to expected values.;z; N)lowerrn   striprl   )r3   rs   ar2   new_authorss        rp   check_authorsrz      s    +;<a	<< mmC(<<>!)99 	K 
 yy%++D11 =s
   B7Btreemetadatac                 ,   | j                  d      D ]Z  }|j                  st        t        j	                  d|j                              }	 t        j                  |      }t        ||      }\ |S # t
        j                  $ r t        ||      }Y w xY w)z,Parse and extract metadata from JSON-LD datazK.//script[@type="application/ld+json" or @type="application/settings+json"]z\1)
xpathtextr   JSON_MINIFYrk   jsonloadsr   JSONDecodeErrorr   )r{   r|   elemelement_textschemas        rp   extract_meta_jsonr      s    

U 
H yy%kooeTYY&GH	HZZ-F#FH5H
H O ## 	H/hGH	Hs   !A11BBc                 N   t         j                  d      }| j                  d      D ]{  }|j                  d      |j                  d      }}|s(|j	                         r9|t
        v r||t
        |   <   N|dk(  rt        |      r||d<   d|t        v smt        d|      |d<   } |S )	zESearch meta tags following the OpenGraph guidelines (https://ogp.me/))rU   r2   urlr>   re   r^   rf   z+.//head/meta[starts-with(@property, "og:")]rd   contentzog:urlr   Nr2   )	dictfromkeysr~   getisspaceOG_PROPERTIESr   	OG_AUTHORr   )r{   resultr   property_namer   s        rp   extract_opengraphr      s    ]]RF
 

HI 	D!%*!5txx	7Jw7??,-7>}]34(*|G/D 'u)+#4T7#Cx 	D Mrr   c           	         t               j                  t        |             }t        |j                  |j
                  |j                  |j                  |j                  |j                  f      r|S g d}}| j                  d      D ]  t        j                  dj                  dd            j                         }|s;dj                  v rj                  dd      j!                         }|j#                  d      r{|dk(  r|j%                  t'        |             |t(        v rt+        |j
                  |      |_        |dk(  r|j                  xs ||_        |t,        v s|j                  xs ||_	        d	j                  v rj                  d	d      j!                         }|t.        v rt+        |j
                  |      |_        L|t0        v r|j                  xs ||_        k|t2        v r|j                  xs ||_        |t4        v r|j                  xs ||_        |t6        v sd
|v r|}|dk(  r |j                  st9        |      r	||_        |t:        v s|j%                  t'        |             dj                  v r|j                  dd      j!                         }|dk(  rt+        |j
                  |      |_        T|dk(  r|j                  xs ||_        p|dk(  sw|j                  xs ||_        t        fdt<        D              st>        jA                  dtC        dd      j                                 |j                  xs ||_        ||_"        |S )z)Search meta tags for relevant informationNz.//head/meta[@content]ri   r   rd   og:zarticle:tagrH   nameztwitter:app:namer[   itempropr2   r>   rX   c              3   :   K   | ]  }|j                   v  y wN)attrib).0keyr   s     rp   	<genexpr>zexamine_meta.<locals>.<genexpr>0  s     >CDKK'>s   zunknown attribute: %sFunicode)pretty_printencoding)#r   	from_dictr   allrU   r2   r   r>   re   r^   iterfindr   rk   r   rw   r   rv   
startswithappendrq   PROPERTY_AUTHORr   METANAME_IMAGEMETANAME_AUTHORMETANAME_TITLEMETANAME_DESCRIPTIONMETANAME_PUBLISHERTWITTER_ATTRSr   METANAME_TAG
EXTRA_METALOGGERdebugr   rL   )	r{   r|   rL   backup_sitenamecontent_attrproperty_attr	name_attritemprop_attrr   s	           @rp   examine_metar      s    z##$5d$;<H NNOOLL  NN	
	 /D 67 B&**2txx	2/FGMMO $ HHZ4::<M''.-N<89/1"3HOO\"R"55$,$5$5$E!.0!)!?<t{{",224IO+"3HOO\"Rn,!)!?<22'/';';'K|$00$,$5$5$E!m+/AY/N". ]*  .+l*N<894;;& HHZ4::<M("3HOO\"R-/'/';';'K|$*,!)!?< >:>>LL'EIFLLNBJ !))<_HHMOrr   expressions	len_limitc                    |D ]  } ||       }|D ]J  }t        dj                  |j                                     }|s.dt        |      cxk  r|k  sCn F|c c S  t        |      dkD  sit        j                  d|t        |              y)zExtract meta information    r   z#more than one invalid result: %s %sN)r    rl   itertextlenr   r   )r{   r   r   
expressionresultsr   r   s          rp   extract_metainfor   >  s    
 " 

T" 	D388DMMO45G1s7|7i7	 w<!LL5z3w<
 rr   c                     d}| j                  d      }|;t        |j                               }t        j	                  |      x}r||d   |d   fS t
        j                  d       |ddfS )z2Extract text segments out of main <title> element.ri   z.//head//titleNr   r   zno main title found)findr    text_contentHTMLTITLE_REGEXmatchr   r   )r{   rU   title_elementr   s       rp   examine_title_elementr   Q  sv     EII./M ]//12#))%0050%(E!H,,
LL&'$rr   c                    | j                  d      }t        |      dk(  r t        |d   j                               }|r|S t	        | t
              xs d}|r|S t        |       \  }}}||fD ]  }|sd|vs|c S  |r|d   j                         S 	 | j                  d      d   j                         }|S # t        $ r t        j                  d       Y |S w xY w)zExtract the document titlez.//h1r   r   ri   .z.//h2zno h2 title found)findallr   r    r   r   r%   r   r~   
IndexErrorr   r   )r{   
h1_resultsrU   firstsecondts         rp   extract_titler   _  s     g&J
:!Z]//12LT<06BE06E5&V_ AH !}))++*

7#A&335 L  *()L*s   "B5 5CCc                 x    t        t        |       t              }t        |t        d      }|rt        d|      }|S )zExtract the document author(s)x   )r   N)r   r   r!   r   r"   r   )r{   subtreer2   s      rp   extract_authorr   {  s5    "8D>3HIGg}DF"40Mrr   default_urlc                    t         D ]6  }| j                  |      }||j                  j                  d      nd}|s6 n r|j	                  d      r| j                  d      D ]n  }|j                  d      xs |j                  d      xs d}|j	                  d      s|j	                  d	      sNt        |j                  d
         }|si||z   } n |rt        |      \  }}|rt        |      nd}|xs |S )z'Extract the URL from the canonical linkNhref/z.//head//meta[@content]r   rd   ri   r   ztwitter:r   )	URL_SELECTORSr   r   r   r   r   r   r   r   )	r{   r   selectorelementr   attrtypebase_urlvalidation_result
parsed_urls	            rp   extract_urlr     s    ! ))H%,3,?gnn  (T	 s~~c"}}%>? 	G{{6*Kgkk*.EKH""5)X-@-@-L'y(AB"S.C	 (4S(9%:+<mJ'$+rr   c                 B    t        |       ^}}t        d |D        d      S )z=Extract the name of a site from the main title (if it exists)c              3   0   K   | ]  }|sd |v s|  yw)r   N )r   parts     rp   r   z#extract_sitename.<locals>.<genexpr>  s     @$4C4K@s   N)r   next)r{   _partss      rp   extract_sitenamer     s#    %d+IA@%@$GGrr   metatypec                 j   g }d| z   dz   | dk(  rt         nt        }|D ]&  }|j                  fd ||      D               |s& n | dk(  r6|s4|j                  d      D ]   }|j	                  |j
                  d          " t        j                  d |D              D cg c]  }|s|	 c}S c c}w )z!Find category and tag informationr   z	[s|ies]?/categoryc              3      K   | ]7  }t        j                  |j                  d          r|j                          9 yw)r   N)researchr   r   )r   r   regexprs     rp   r   z#extract_catstags.<locals>.<genexpr>  s9      
yy$++f"56 
s   =A zR.//head//meta[@property="article:section" or contains(@name, "subject")][@content]r   c              3   8   K   | ]  }|st        |        y wr   )r   )r   xs     rp   r   z#extract_catstags.<locals>.<genexpr>  s     $NAA_Q%7$Ns   )r#   r$   extendr~   r   r   r   r   )r   r{   r   xpath_expressioncatexprr   rr   s          @rp   extract_catstagsr     s    GHn{*G,4
,B(#  

 	

  :gzz`
 	6G NN7>>)45	6 }}$N$NNT!RSATTTs   !B0)B0r   strictc                     t         j                  | j                  dd            }|rd|d   j                          d|d    S | j                  r?|r(t
        j                  | j                        }|r|d   S dS t        | j                        S y)	zkProbe a link for identifiable free license cues.
    Parse the href attribute first and then the link text.r   ri   zCC r   r   r   r   N)LICENSE_REGEXr   r   upperr   TEXT_LICENSE_REGEXr    )r   r   r   s      rp   parse_license_elementr     s       VR!89EU1X^^%&aaz22||&--gll;E$58.$.GLL!!rr   c                     | j                  d      D ]  }t        |d      }||c S  | j                  d      D ]  }t        |d      }||c S  y)z:Search the HTML code for license information and parse it.z.//a[@rel="license"][@href]F)r   Nz[.//footer//a[@href]|.//div[contains(@class, "footer") or contains(@id, "footer")]//a[@href]T)r   r   r~   )r{   r   r   s      rp   extract_licenser     so     << => &wu=M
 ::e  'wt<M rr   filecontentdate_config	extensivec                 R   |xs
 t               }|xs t        |      }t        |       }|
t               S t	        |      }|j
                  rd|j
                  vrd|_        	 t        ||      }|j                  st        |      |_
        |j
                  r|rt        |j
                  |      |_        |j
                  st        |      |_        |j
                  r|rt        |j
                  |      |_        |j                  st        ||      |_        |j                  rt!        |j                  d      |_        |j                  |d<   t%        |fi ||_        |j(                  st+        |      |_        |j(                  rt-        |j(                  t.              r|j(                  d   |_        n4t-        |j(                  t0              rt3        |j(                        |_        |j(                  j5                  d      |_        |j(                  rd	|j(                  vrt|j(                  d   j7                         sW|j(                  j                         |_        n7|j                  r+t8        j;                  |j                        }|r
|d
   |_        |j<                  st?        d|      |_        |j@                  st?        d|      |_         tC        |      |_"        |d   |_#        |jI                          |S # t        $ r!}t        j                  d|       Y d}~d}~ww xY w)a  Main process for metadata extraction.

    Args:
        filecontent: HTML code as string or parsed tree.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.

    Returns:
        A trafilatura.settings.Document containing the extracted metadata information or None.
        The Document class has .as_dict() method that will return a copy as a dict.
    Nr   z%error in JSON metadata extraction: %sT)fastr   r   @r   r   r   tagmax_date)%setr   r   r   r   r2   r   	Exceptionr   warningrU   r   rz   r   r   r   r   hostnamer   datere   r   
isinstancelistr   strlstripisupperMETA_URLr   
categoriesr   rL   r   licensefiledateclean_and_trim)	r   r   r   r  rs   r{   r|   errmymatchs	            rp   extract_metadatar    s   ( (035;!;K [!D|z D!H 3hoo5E$T84
 >>&t, +'9IJ??(.+'9IJ <<"45 ||*8<<dC "Kd2k2HM ,T2h''. ( 1 1! 4H))40 #H$5$5 6H$--44S9 8,,,%%a(002 ( 1 1 7 7 9H	... '
H .z4@ ==(5 't,H $J/HOK  E>DDEs   "K< <	L&L!!L&)   r   )F)NNTN)___doc__r   loggingr   copyr   htmlr   typingr   r   r   r   r	   r
   r   courlanr   r   r   r   r   r&   r   
lxml.etreer   	lxml.htmlr   r   htmlprocessingr   json_metadatar   r   r   r   settingsr   r   utilsr   r   r   r    xpathsr!   r"   r#   r$   r%   __all__	getLogger__name__r   setLevelWARNINGcompiler  r   r   rj   r   Ir   r   r   r   r   r   METANAME_URLr   r   r   r   r   r   r   r  rq   rz   r   r   r   intr   r   r   r   r   r   r   boolr   r   r  r   rr   rp   <module>r1     s     	   ? ? ?    + 0  0 D D  ,			8	$   *  & &w 72::=>bjj01"**< "**X&

D  RZZ[DD 
&   ]+ -.!34 3
 #" -.	5 5 5
3 
#c( 
x} 
K 8   K Dhsm1C,D .^{ ^x ^D CF
$(K<?c]&

3x},-  8 # k  RU 6H; H8C= HUs U+ U$s) U6;  RU  + (3- & "&!%+/k{C'(k#k #k 	k
 s3x(k krr   