
    (#h2              	          d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZmZ h dZh d	Zh d
Z ej(                  dej*                        Z ej(                  dej*                        Z ej(                  d      Z ej(                  dej*                        Z ej(                  dej*                        Z ej(                  dej*                        Z ej(                  dej8                        Z ej(                  d      Z ej(                  dej8                        Z ej(                  d      Z dZ! ej(                  dej*                        Z" ej(                  dej*                        Z#de"fde#fgZ$ ej(                  dej8                        Z% ej(                  d      Z& ej(                  d      Z' ej(                  d      Z( ej(                  d      Z) ej(                  d       Z* ej(                  d!ej8                        Z+ ej(                  d"      Z, ej(                  d#ej8                        Z- ej(                  d$ej\                        Z/d8d%ed&ed'e	e0   d(e1fd)Z2d*ed%ed(efd+Z3d,eee   ee0e0f   f   d%ed(efd-Z4d.e0d/e
e0   d(e	e0   fd0Z5d1e0d%ed(efd2Z6d3e0d(e0fd4Z7d5e	e0   d6e0d(e	e0   fd7Z8y)9z
Functions needed to scrape metadata from JSON-LD format.
For reference, here is the list of all JSON-LD types: https://schema.org/docs/full.html
    N)unescape)AnyDictListOptionalPatternUnion   )Document)HTML_STRIP_TAGStrim>
   articleblogpostingnewsarticleliveblogpostingscholarlyarticleopinionnewsarticlesocialmediapostingreportagenewsarticlebackgroundnewsarticlemedicalscholarlyarticle>!   blogqapagereportr   faqpagewebpagewebsiteitempage	aboutpage
jobpostingr   contactpager   profilepagetecharticlecheckoutpagecollectionpagemedicalwebpager   satiricalarticler   realestatelistingreviewnewsarticlesearchresultspager   r   analysisnewsarticleaskpublicnewsarticler   r   discussionforumpostingr   advertisercontentarticle>   r   r   organizationnewsmediaorganizationzM"author":[^}[]+?"name?\\?": ?\\?"([^"\\]+)|"author"[^}[]+?"names?".+?"([^"]+)z$"[Pp]erson"[^}]+?"names?".+?"([^"]+)z`,?(?:"\w+":?[:|,\[])?{?"@type":"(?:[Ii]mageObject|[Oo]rganization|[Ww]eb[Pp]age)",[^}[]+}[\]|}]?z,"publisher":[^}]+?"name?\\?": ?\\?"([^"\\]+)z"@type"\s*:\s*"([^"]*)"z"articleSection": ?"([^"\\]+)z"author":|"person":)flagsz<[^>]+>z^https?://schema\.orgz\\u([0-9a-fA-F]{4}))	givenNameadditionalName
familyNamez*"@type":"[Aa]rticle", ?"name": ?"([^"\\]+)z"headline": ?"([^"\\]+)z"name"z
"headline"uB   ^([a-zäöüß]+(ed|t))? ?(written by|words by|words|by|von|from) z\d.+?$z@[\w]+z[._+]u$   ["‘({\[’\'][^"]+?[‘’"\')\]}]u   [^\w]+$|[:()?*$#!%/<>{}~¿]u;   \b\s+(am|on|for|at|in|to|from|of|via|with|—|-|–)\s+(.*)z3\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\bz"/|;|,|\||&|(?:^|\W)[u|a]nd(?:$|\W)u>   [✀-➾😀-🙏☀-⛿🌀-🗿🤀-🧿🩰-🫿🚀-🛿]+metadata	candidatecontent_typereturnc                     |r|t        |t              rl| j                  r&t        | j                        t        |      k  r|dk7  ry| j                  r-| j                  j	                  d      r|j	                  d      syy)z6Determine if the candidate should be used as sitename.r   ThttpF)
isinstancestrsitenamelen
startswith)r5   r6   r7   s      V/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/json_metadata.pyis_plausible_sitenamerA   9   si    Z	3/  S):):%;c)n%LQ]ajQj!2!2!=!=f!EiNbNbciNj    parentc                    t        d|       D ]  }d|v rd|d   v r|d   d   |_        d|vs|d   s&t        |d   t              r|d   d   n|d   }|j	                         }|t
        v r|j                  st        |      |_        |t        v rM|j                  d      xs$ |j                  d      xs |j                  d      }t        |||      s||_        |dk(  rG|j                  d      s|d   j                  d	      rt        |j                  |d         |_        |t        v s%d
|v r|d
   }t        |t              r	 t!        j"                  |      }t        |t              s|g}|D ]  dvs	d   dk(  sd}dv r\j                  d      }t        |t              r!dj'                  |      j)                  d      }n@t        |t*              r0d|v r,|d   }n&dv r"dv rdj'                  fdt,        D              }t        |t              st        |j                  |      |_         |j.                  s@d|v r<t        |d   t              r|d   g|_        nt        t        d|d               |_        |j0                  rd|v r|dk(  r|d   |_        d|v s|d   |_         |S # t         j$                  $ r t        |j                  |      |_        Y w xY w)z3Find and extract selected metadata from JSON parts.N	publishername@typer   	legalNamealternateNamepersonr:   authorPerson; r2   r4    c              3   2   K   | ]  }|v s|     y wN ).0xrK   s     r@   	<genexpr>z!process_parent.<locals>.<genexpr>x   s     2bVW[aVa6!92bs   	
articleSectionr   headline)filterr=   r;   listlowerJSON_OGTYPE_SCHEMApagetypenormalize_jsonJSON_PUBLISHER_SCHEMAgetrA   r?   normalize_authorsrK   JSON_ARTICLE_SCHEMAr<   jsonloadsJSONDecodeErrorjoinstripdictAUTHOR_ATTRS
categoriestitle)rC   r5   contentr7   r6   list_authorsauthor_namerK   s          @r@   process_parentrm   C   s   $' D9'!f0D&D ' 4V <H'!)9 /99I4.Pww'*V]^eVf#))+ --h6G6G .| <H00F+gw{{;/Gg7;;WfKgI$Xy,G$-!X%{{6"76?+E+Ef+M"3HOOWV_"U007"&x0lC0['+zz,'?
 ",5$0>L* ^Ff,w80K&*!V+*0**V*<K)+t<.2ii.D.J.J4.P!+K!>6[CX.9&.A(F2|v7M*-((2bl2b*bK%k37.?Q\.]HO^" &&+;w+Fg&67=+23C+D*EH'*.vdGDT<U/V*WH' >>W$)B%,V_HN7*%,Z%8HNID9J OI  // [*;HOO\*Z[s   K.K43K4schemac                    t        | t              r| g} | D ]  }|j                  d      }|st        |t              s(t        j                  |      s>d|v rt        |d   t              r|d   n|d   g}nQd|v rKt        |d   t              r8d|d   j                         v r#d|v rt        |d   t              r|d   n|d   g}n| }t        ||      } |S )z,Parse and extract metadata from JSON-LD dataz@contextz@graphrG   r   liveBlogUpdate)	r;   rf   r^   r<   JSON_SCHEMA_ORGmatchrX   rY   rm   )rn   r5   rC   contexts       r@   extract_jsonrt      s    &$ 8**Z(z'3/O4I4I'4R6!-7x8H$-O)V\]eVfUgF"z&/3'GL]aghoapavavaxLx  ~N  RX  ~X5?GW@XZ^5_ 01flm}f~e%fh7H8 OrB   elemtextregular_expressionc                     d}|j                  |       }|rEd|d   v r>t        ||d         }|j                  d| d      } |j                  |       }|rd|d   v r>|xs dS )z.Crudely extract author names from JSON-LD dataNrN   r
    )count)searchr_   sub)ru   rv   authorsmymatchs       r@   extract_json_authorr~      sx    G ''1G
cWQZ'#GWQZ8%))#xq)A$++H5 cWQZ' ?drB   elemc                    t         j                  d|       }t        |t              xs t        |t              }|r||_        d| v rBt        j                  |       }|r+t        |d   j                               }|t        v r||_        d| v r?t        j                  |       }|r(d|d   vr!t        |d         }t        ||      r||_        d| v r+t        j                  |       }|rt        |d         g|_        t"        D ]@  \  }}|| v s|j$                  r|j                  |       }|s,t        |d         |_         |S  |S )z*Crudely extract metadata from JSON-LD datarx   rG   r
   z"publisher",z"articleSection")JSON_AUTHOR_REMOVEr{   r~   JSON_AUTHOR_1JSON_AUTHOR_2rK   	JSON_TYPErz   r\   rY   rZ   r[   JSON_PUBLISHERrA   r=   JSON_CATEGORYrh   JSON_SEQri   )r   r5   element_text_authorrK   r}   r6   keyregexs           r@   extract_json_parse_errorr      sQ    -00T: !4mD E !4mD   $""4(&wqz'7'7'9:I..$-!  ''-s'!*,&wqz2I$Xy9$-! T!&&t,#1'!*#=">H  
U$;x~~ll4(G!/
!;O OrB   stringc                     d| v rl| j                  dd      j                  dd      j                  dd      } t        j                  d |       } dj                  d | D              } t	        |       } t        t        j                  d|             S )z-Normalize unicode strings and trim the output\z\nrx   z\rz\tc                 2    t        t        | d   d            S )Nr
      )chrint)rr   s    r@   <lambda>z normalize_json.<locals>.<lambda>   s    Ca"<M8N rB   c              3   Z   K   | ]#  }t        |      d k  st        |      dkD  s | % yw)i   i  N)ordrR   cs     r@   rT   z!normalize_json.<locals>.<genexpr>   s$     Qqc!fvoQ&Qs   !++)replaceJSON_UNICODE_REPLACEr{   rd   r   r   JSON_REMOVE_HTML)r   s    r@   r\   r\      s~    v~r*225"=EEeRP%))*NPVWQFQQ&! $$R011rB   current_authorsauthor_stringc                 j   g }|j                         j                  d      st        j                  |      r| S | | j	                  d      }d|v r|j                         j                  d      }d|v sd|v rt        |      }t        j                  d|      }t        j	                  |      D ]S  t              t        j                  d      t        j                  d      t        t        j                  d            t        j                  d      t         j                  d      t"        j                  d      t$        j                  d      t&        j                  d      rt)              d	k\  r	dvrd
vrd   j+                         rt-        d D              dk  rj/                         |vst)        |      dk(  st1        fd|D              sC|j3                         V t)        |      dk(  r| S dj5                  |      j7                  d      S )z3Normalize author info to focus on author names onlyr:   rM   z\uunicode_escapez&#z&amp;rx   rN   2   -r   c              3   B   K   | ]  }|j                         sd   yw)r
   N)isupperr   s     r@   rT   z$normalize_authors.<locals>.<genexpr>  s     )Kqyy{!)Ks   r
   c              3   &   K   | ]  }|v 
 y wrP   rQ   )rR   
new_authorrK   s     r@   rT   z$normalize_authors.<locals>.<genexpr>  s     F~dnzY_G_F~s   )rY   r?   AUTHOR_EMAILrr   splitencodedecoder   r   r{   AUTHOR_SPLITr   AUTHOR_EMOJI_REMOVEAUTHOR_TWITTERAUTHOR_REPLACE_JOINAUTHOR_REMOVE_NICKNAMEAUTHOR_REMOVE_SPECIALAUTHOR_PREFIXAUTHOR_REMOVE_NUMBERSAUTHOR_REMOVE_PREPOSITIONr>   r   sumri   allappendrd   re   )r   r   new_authorsrK   s      @r@   r_   r_      s   K''/<3E3Em3T"%++D1%,,.556FG}= 8 /#''M:M$$]3 'f$((V4##B/)--c6:;'++B7&**2v6""2v.&**2v6*..r6: #f++60AcQWFWay  "c)KV)K&Ka&O\\^F$#k*:a*?3F~r}F~C~v&/'0 ;199[!''--rB   rP   )9__doc__ra   rehtmlr   typingr   r   r   r   r   r	   settingsr   utilsr   r   r`   rZ   r]   compileDOTALLr   r   r   r   r   r   
IGNORECASE
JSON_MATCHr   rq   r   rg   	JSON_NAMEJSON_HEADLINEr   r   r   r   r   r   r   r   r   r   UNICODEr   r<   boolrA   rm   rt   r~   r   r\   r_   rQ   rB   r@   <module>r      s  
  	  < <  ( _  [	 W 

kmomvmvw

BBIINRZZ  !D  E KRYYWBJJ1299=	

;RYYGRZZ.bmmD
2::j) "**5R]]K!rzz"89 <BJJDbiiP	

5ryyAy!L-#@A

`hjhuhuv"

9- I& bjj* #$KL "

#AB &BJJ'emomzmz{ rzzPQrzz?r}}U bjj	 

	 H  HUXM ei G3 G( Gx GTtCy$sCx.89 X RZ ,# 73< HUXM '3 '( 'x 'T23 23 2*.x} *.S *.XVY] *.rB   