
    "#hhz                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZmZ d	d
lmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 d	dl:m;Z;m<Z<m=Z= d	dl>m?Z?m@Z@mAZAmBZB d	dlCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZL  ej                  eN      ZOdedePfdZQh dZRh dZSh dZTh dZUddhZVeUj                  eV      ZXh dZY ej                  d      Z[e'e(fe)e*ffZ\dePde?deeP   fdZ]dedePde?deeP   fdZ^dede?deeP   fdZ_d eeP   d!eeP   d"eeP   de?deeeP      f
d#Z`d$ePd%eeP   d!eeP   d"eeP   de?deeeP      fd&Za e	e;'      d(ebdePde?debfd)       Zcdede?deeP   fd*Zddede?deeP   fd+Zed,eeeP      dePfd-Zfd$ePde?deeP   fd.Zg	 	 	 	 	 	 	 	 d9d/eehePef   d0eid1eid2ePd3eeP   d4eid5eeeePf      d6eeeePf      d7eideeP   fd8Zjy):zZModule bundling all functions needed to determine the date of HTML strings
or LXML trees.
    N)Counter)deepcopy)datetime)	lru_cachepartial)MatchOptionalPatternUnionr   )HtmlElementtostring   )%discard_unwantedextract_url_dateidiosyncrasies_search
img_searchjson_searchregex_parsepattern_searchtry_date_exprDATE_EXPRESSIONSFAST_PREPENDSLOW_PREPENDFREE_TEXT_EXPRESSIONSMAX_SEGMENT_LENMIN_SEGMENT_LENYEAR_PATTERNYMD_PATTERNCOPYRIGHT_PATTERNTIMESTAMP_PATTERNTHREE_PATTERNTHREE_CATCHTHREE_LOOSE_PATTERNTHREE_LOOSE_CATCHSELECT_YMD_PATTERNSELECT_YMD_YEARYMD_YEARDATESTRINGS_PATTERNDATESTRINGS_CATCHSLASHES_PATTERNSLASHES_YEARYYYYMM_PATTERNYYYYMM_CATCHMMYYYY_PATTERNMMYYYY_YEARSIMPLE_PATTERNTHREE_COMP_REGEX_ATHREE_COMP_REGEX_BTWO_COMP_REGEX)
CACHE_SIZECLEANING_LISTMAX_POSSIBLE_CANDIDATES)	Extractor
clean_html	load_html	trim_text)	check_extracted_referencecompare_valuesfilter_ymd_candidateget_min_dateget_max_dateis_valid_dateis_valid_formatplausible_year_filtervalidate_and_convertelementreturnc                 :    t        | dd      j                         S )z,Format the element to be logged to a string.Funicodepretty_printencoding)r   strip)rD   s    J/var/www/html/sandstorm/venv/lib/python3.12/site-packages/htmldate/core.py	logstringrM   K   s    G%)DJJLL    >E   dc.datedc:date
bt:pubdate
dc.created
dc:created
og:pubdate
og:regdatedcterms.datepublish-datesailthru.datedc.date.issueddcterms.issuedpublished-datearticle.createddc.date.createddcterms.createdog:publish_dateog:datepublishedparsely-pub-datetwt-published-atarticle.publishedarticle:post_datearticle:publishedog:published_timevr:published_timevideo:release_datedc.date.publicationrnews:datepublishedfield-name-post-dateog:article:publishedarticle:published_datearticle:published_timearticle:publicationdateog:article:published_timeog:question:published_timecxenseparse:recs:publishtimedcsext.articlefirstpublishedanalyticsattributes.articledate"shareaholic:article_published_timedatemetapdateptimecreatedgentimepubdatedoc_date	rbpubdate	timestamp
dateposteddatecreateddisplaydatepublishdatedate_createdpublish_datepublish_timerelease_datecitation_datedatepublishedpublisheddatedate_publishedpublished_datepublished_timepublication_datecontent_create_datearticle_date_originalmediator_published_timeoriginalpublicationdatecitation_publication_date>   utimelastmodlastdatemodifiedlastmodifiedlast-modified>   
bt:moddatedc.modifiedog:updated_timearticle:modifieddcterms.modifiedog:modified_timearticle:modified_datearticle:modified_timearticle:post_modifiedog:article:modified_timedatemodifiedr   r   updated_timemodified_timerevision_datemodificationdate>   pubyearr   r   r   
dateupdate>   date-publishedtime published	publishedz\D+$textoptionsc                     t        |       } t        |       t        k  ryt        j	                  d| dt
               } t        | |j                  |j                  |j                  |j                        S )z'Prepare text and try to extract a date.N )r:   lenr   NON_DIGITS_REGEXsubr   r   format	extensiveminmax)r   r   s     rL   examine_textr      s`    
 T?D
4yO#D)9/$:;Dgnng//gkk rN   tree
expressionc                     | j                  |      }|rt        |      t        kD  ry|D ]=  }|j                         |j	                  dd      fD ]  }t        ||      }|s|c c S  ? y)z3Check HTML elements one by one for date expressionsNtitler   )xpathr   r6   text_contentgetr   )r   r   r   elementselemr   attempts          rL   examine_date_elementsr      st     zz*%Hs8}'>> &&($((7B*?@ 	D"41G	 rN   c                 	   d\  }}t        t        |j                  |j                  |j                  |j
                        }| j                  d      D ]U  }|j                  rd|j                  vrd|j                  vr-d|j                  v r|j                  dd      j                         }|dk(  rt        |j                  d      |      }n|t        v r8t        j                  d	t        |              ||j                  d            }n|t        v rt        j                  d	t        |             |j                   s ||j                  d            }nH ||j                  d            }n/d
|j                  v r|j                  d
d      j                         }|t        v s	|t"        v rt        j                  dt        |              ||j                  d            }||t        v r|j                   s|t"        v r|j                   s|}n|}nd|j                  v r"|j                  dd      j                         }|t$        v rzt        j                  dt        |              ||j                  d      xs |j                  d            }||t&        v r|j                   s|t(        v r|j                   s|}n|dk(  rt        j                  dt        |             d|j                  v rdj+                  |j                  dd      ddg      }t-        |d|j                  |j
                        rZ|}nVd|j                  v r[|j                  dd      j                         dk(  r$t        j                  dt        |              ||j                  d            }nd|j                  v r|j                  dd      j                         }|dk(  r[t        j                  dt        |             |j                   r ||j                  d            }nw ||j                  d            }n_|dk(  rZt        j                  dt        |             |j                   s ||j                  d            }n ||j                  d            }|V n ||t        j                  d       |}|S )a  
    Parse header elements to find date cues

    :param tree:
        LXML parsed tree object
    :type tree: LXML tree
    :param options:
        Options for extraction
    :type options: Extractor
    :return: Returns a valid date expression as a string, or None

    )NN)outputformatextensive_searchmin_datemax_datez.//metacontentr   namer   zog:urlzexamining meta name: %spropertyzexamining meta property: %sitempropzexamining meta itemprop: %scopyrightyear-01%Y-%m-%dearliestlatestr|   zexamining meta pubdate: %sz
http-equivrv   zexamining meta http-equiv: %sr   z-opting for reserve date with less granularity)r   r   r   r   r   r   iterfindattribr   lowerr   DATE_ATTRIBUTESLOGGERdebugrM   NAME_MODIFIEDoriginalPROPERTY_MODIFIEDITEMPROP_ATTRSITEMPROP_ATTRS_ORIGINALITEMPROP_ATTRS_MODIFIEDjoinr@   )r   r   
headerdatereservetryfuncr   	attributer   s           rL   examine_headerr      s     %J^^ **G i( V +$++-T[[ ,224IH$*488I+>Ho-6	$H$TXXi%89
m+6	$H''!())<!=J%dhhy&9:G4;;&R0668IO+y<M/M:IdOL!$((9"56&!_49I9I!%66w?O?O%,
 #*4;;&R0668IN*:IdOL!$((:"6"M$((9:MN&!%<<AQAQ!%<<WEUEU%,

 o-:IdOL+!hhB(?t'LMG$gkk'++ #*$++%xx	2&,,.);99T?K$TXXi%89
T[[(r288:IF"<ioN##!())<!=J%dhhy&9:Go-<ioN''!())<!=J%dhhy&9:G!mVp g1DE
rN   occurrencescatchyearpatc           
         | rt        |       t        kD  ryt        |       dk(  r#|j                  t        t	        |                   S | j                  d      }t        j                  d|       t        ||j                         dd }t        j                  d|       t        | \  }}g }|D ]*  }	|j                  |	      }
|
s|j                  |
d          , |D cg c]9  }t        t        t        |      dd      d|j                  |j                   	      ; }}t#        |      rf|d
   |d   k(  r|j                  |d
         }|S |d   |d
   k7  r$|d   |d
   z  dkD  r|j                  |d         }|S |j                  |d
         }|S t%        |      r%|j                  ||j'                  d               }|S t        j                  d|d
   |d          d}|S c c}w )z2Select a candidate among the most frequent matchesNr   
   zfirstselect: %s)reverse   zbestones: %s%Yr   r   g      ?Tzno suitable candidate: %s %s)r   r6   searchnextitermost_commonr   r   sortedr   zipappendr@   r   intr   r   allanyindex)r   r   r   r   firstselectbestonespatternscountsyearspattern
year_matchyear
validationmatchs                 rL   select_candidater  c  s    #k*-DD
;1||Dk!2344 ))"-K
LL"K0kw/?/?+?@!DH
LL* H~HfE (^^G,
LLA'( 	  	SY1%tgkk'++	
J  :!9q	!LL!-E L 1Xq!fQi&)&;c&ALL!-E L LL!-E L 
ZXj&6&6t&<=> L 	3U1XuQxHL/s   >G!
htmlstringr   c                 f    t        | |||j                  |j                        }t        ||||      S )z)Chained candidate filtering and selectionr   r   r   r   )rB   r   r   r  )r  r   r   r   r   
candidatess         rL   search_patternr	    s8     '{{J Jw@@rN   )maxsize	referencec                     t        ||j                  |j                  |j                  |j                        }|t        | ||      S | S )z[Compare candidate to current date reference (includes date validation and older/newer test))r   r   r   r   r   r<   )r  r   r   r   s       rL   compare_referencer    sH     GNNG$5$5w{{GKKG i'::rN   c                    | j                  d      }dt        |      cxk  r
t        k  rn y	d}|D ]  }d|j                  v ra	 t	        |j                  dd            }t        j                  d|       |j                  r|dk(  s||k  r|}]|j                  rj||kD  sp|}s|j                  d      t        v sd|j                  v r|j                  d      }t        j                  d|       |j                  r?t        ||j                  |j                  |j                  |j                        }||c S t!        |||      }|dkD  s nb|j"                  s/t        |j"                        d
kD  sIt        j                  d|j"                         t!        ||j"                  |      } t%        ||      xs t'        | d|      S y	# t        $ r Y w xY w)zTScan the page for abbr elements and check if their content contains an eligible datez.//abbrr   z
data-utimer   zdata-utime found: %sclassr   zabbr published-title found: %sNr   zabbr published found: %s)findallr   r6   r   r   r   
ValueErrorr   r   r   CLASS_ATTRSr   r   r   r   r   r  r   r;   r   )r   r   r   r  r   	candidatetrytextr   s           rL   examine_abbr_elementsr    s   
 ||I&H3x=222` _ 	 '	QDt{{* #DHH\2$> ?I 3Y?##a9y;P )I ))i).C )I'"k1dkk)"hhw/GLL!A7K''"/##NN#--#KK#KK# #.#*N$5i'$R	$q=!YY3tyy>B#6LL!;TYYG 1)TYY PIO'	QR )G< 
@UA
 	

 S " s   G	GGc                 F   | j                  d      }dt        |      cxk  r
t        k  rn yd}|D ]  }d}|j                  dd      }t        |      dkD  r[d|j                  v r9|j                  d      dk(  r%|j
                  rd}t        j                  d	|       nd
|j                  v r|j
                  r[|j                  d
d      j                  d      s!|j                  d
d      j                  d      rd}t        j                  d|       nO|j
                  sC|j                  d
      dk(  r/d}t        j                  d|       nt        j                  d|       |r?t        ||j                  |j                  |j                  |j                        }|o|c S t        |||      }|j                  t        |j                        dkD  st        j                  d|j                         t        ||j                  |      } t!        ||      S y)zTScan the page for time elements and check if their content contains an eligible datez.//timer   Fr   r      r|   Tz#shortcut for time pubdate found: %sr  z
entry-datez
entry-timez$shortcut for time/datetime found: %supdatedz,shortcut for updated time/datetime found: %sztime/datetime found: %sNztime/datetime found in text: %s)r  r   r6   r   r   r   r   r   
startswithr   r   r   r   r   r  r   r;   )r   r   r   r  r   shortcut_flagdatetime_attrr   s           rL   examine_time_elementsr    s   
 ||I&H3x=222p m 	 1	MD!M HHZ4M=!A% ,+y8(($(MLL!FV+''"-88F88GR0;;LI(,BM %--$((72Cy2P(,J) LL!:MJ +%))G *& 1)]G TI&3tyy>A+=>		J-iGL	c1	Mh )G<<rN   r  c                     d | j                         D        \  }}}t        |      dk(  r|d   dk(  rd| nd| }| d| d| S )zoNormalize string output by adding "0" if necessary,
    and optionally expand the year from two to four digits.c              3   D   K   | ]  }|s|j                  d         yw)r   N)zfill).0gs     rL   	<genexpr>z"normalize_match.<locals>.<genexpr>8  s     @qa
@s     r   r   91920r   )groupsr   )r  daymonthr  s       rL   normalize_matchr)  5  s[     AELLN@C
4yA~"1gnD6{Btf+V1UG1SE""rN   c           
         t         j                  d       d}t        | t        t        t        |      }|Tt        |d         }t        t        |dd      d|j                  |j                        rt         j                  d|       |}t         j                  d       t        D ]]  }t        | |d   |d   t        |      }t        ||d   |j                  ||j                  |j                  |j                        }|[|c S  t        | t        t         |j                  |j                  	      }i }|D ]*  }	t#        j$                  |	      }
t'        |
      }||	   ||<   , t)        |      }t+        |t,        t.        |      }t        |t        |j                  ||j                  |j                  |j                        }||S t        | t0        t2        t        |      }t        |t0        |j                  ||j                  |j                  |j                        }||S t        | t4        t6        |j                  |j                  d
      }i }|D ]*  }	t9        j$                  |	      }
t'        |
      }||	   ||<   , t)        |      }t+        |t,        t.        |      }t        |t4        |j                  ||j                  |j                  |j                        }||S t         j                  d       t        | t:        t<        t        |      }|t        t        |d         t        |d         d      }|dk(  s|j>                  |k\  rStA        ||j                  |j                  |j                        }|$t         j                  dt:        |d   |d          |S t        | tB        tD        |j                  |j                  |j                        }i }|D ]N  }	tG        j$                  |	      }
|
d   }tI        |      dk(  rd| }djK                  |
d   |dg      }||	   ||<   P t)        |      }t+        |t,        t.        |      }t        |tB        |j                  ||j                  |j                  |j                        }||S tM        |       }|dk(  s|r@|j>                  |k\  r1tA        ||j                  |j                  |j                        }||S |dk7  rFt         j                  d       t        t        |      dd      }|jO                  |j                        S t         j                  d       t        | tP        t        t        |      }|t        t        |d         dd      }t        |d|j                  |j                        rH|j>                  |k\  r9t         j                  dtP        |d          |jO                  |j                        S y)a  
    Opportunistically search the HTML text for common text patterns

    :param htmlstring:
        The HTML document in string format, potentially cleaned and stripped to
        the core (much faster)
    :type htmlstring: string
    :param options:
        Define extraction options
    :type options: Extractor
    :return: Returns a valid date expression as a string, or None

    z(looking for copyright/footer informationr   Nr   r   r   z'copyright year/footer pattern found: %sz3 componentsr  T)r   r   r   r   
incompletezswitching to two componentsr   z#date found for pattern "%s": %s, %s0r   r   zusing copyright year as defaultzswitching to one componentr   zdate found for pattern "%s": %s))r   r   r	  r   r   r   r@   r   r   r   THREE_COMP_PATTERNSr=   r   r   rB   r%   r&   r1   r  r)  r   r  r   r'   r(   r)   r*   r+   r2   r,   r-   r  rC   r.   r/   r3   r   r   r   strftimer0   )r  r   copyear	bestmatchr  r   resultr  replacementitemr  r  
dateobjectr(  s                 rL   search_pager5  >  s     LL;<GI 9Q< T1a $W[[
 LLBDIG LL  ( "QKQK
	 &QKNNKKKK
 M%* '"{{J K 2"((.#E*	!+D!1I2 %J [(GLI!F  I "F  '{{J K 2"((.#E*	!+D!1I2 %J [(GLI!F  LL./I c)A,/Yq\1BAF
a<:??g5)GNNW[[F !9"aLaL	  '{{##J K 2$$T*au:?wKEHHeAht45	!+D!1I2 %J [(GLI!F  Z(J!|
z''A%W[[
 M !|67c'lAq1
""7>>22 LL-.I c)A,/A6
JW[[ 7*LL1>9Q< &&w~~66rN   
htmlobjectr   original_dater   urlverboser   r   deferred_url_extractorc	                    |r$t        j                  t         j                         t        |       }	|	y|dk7  rt	        |      syt        |t        |      t        |      ||      }
d}|$|	j                  d      }||j                  d      }t        ||
      }||s|S t        |	|
      xs t        |	|
      }||S |r||S t        |	|
      }||S 	 t        t        t!        |	      t"                    \  }}|rt*        t,        z   }nt.        t,        z   }t1        |||
      xs t1        |d|
      xs t3        ||
      }||S 	 t5        |dd	
      }t;        |t<        |
      xs t?        ||
      xs tA        ||
      }||S |rt&        jC                  d       d}tE        |      D ]>  }|jG                         }tH        tK        |      cxk  r	tL        k  sn 2tO        |||
      }@ tQ        ||
      }|xs tS        ||
      S y# t$        $ r |	}t&        j)                  d       Y 0w xY w# t6        $ r! t5        |d      j9                  dd      }Y w xY w)a  
    Extract dates from HTML documents using markup analysis and text patterns

    :param htmlobject:
        Two possibilities: 1. HTML document (e.g. body of HTTP request or .html-file) in text string
        form or LXML parsed tree or 2. URL string (gets detected automatically)
    :type htmlobject: string or lxml tree
    :param extensive_search:
        Activate pattern-based opportunistic text search
    :type extensive_search: boolean
    :param original_date:
        Look for original date (e.g. publication date) instead of most recent
        one (e.g. last modified, updated time)
    :type original_date: boolean
    :param outputformat:
        Provide a valid datetime format for the returned string
        (see datetime.strftime())
    :type outputformat: string
    :param url:
        Provide an URL manually for pattern-searching in URL
        (in some cases much faster)
    :type url: string
    :param verbose:
        Set verbosity level for debugging
    :type verbose: boolean
    :param min_date:
        Set the earliest acceptable date manually (ISO 8601 YMD format)
    :type min_date: datetime, string
    :param max_date:
        Set the latest acceptable date manually (ISO 8601 YMD format)
    :type max_date: datetime, string
    :param deferred_url_extractor:
        Use url extractor as backup only to prioritize full expressions,
        e.g. of the type `%Y-%m-%d %H:%M:%S`
    :type deferred_url_extractor: boolean
    :return: Returns a valid date expression as a string, or None
    )levelNr   z.//link[@rel="canonical"]hrefzlxml cleaner errorz.//title|.//h1FrG   rH   )rI   zutf-8ignorezextensive search startedr   )*loggingbasicConfigDEBUGr9   rA   r7   r?   r>   findr   r   r   r   r  r   r8   r   r5   r  r   errorr   r   r   r   r  r   UnicodeDecodeErrordecoder   r    r   r   r   r   rK   r   r   r   r  r;   r5  )r6  r   r7  r   r8  r9  r   r   r:  r   r   
url_resulturlelemr1  abbr_resultsearch_tree	discarded	date_exprr  r  segment	converteds                         rL   	find_daterN  (  s   d '--0Z D |z!/,*G XXG J
{))78++f%C "#w/J&< D'*Hk$.HF *"8 (K +!1x~}5"
Y  #33	 #33	
 		
 
	7
 !

	7 !g6  Yk	R
 	z#4g> 	6k7+	6 W5 
  /0	,[9 	GGmmoG"S\COC))WgFI		G
 .iA	<K
G<<E  +)*+L  Yk>EEgxX
Ys$   %H H/  H,+H,/&II)TFr   NFNNF)k__doc__r?  recollectionsr   copyr   r   	functoolsr   r   typingr   r	   r
   r   Counter_Type	lxml.htmlr   r   
extractorsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   settingsr4   r5   r6   utilsr7   r8   r9   r:   
validatorsr;   r<   r=   r>   r?   r@   rA   rB   rC   	getLogger__name__r   strrM   r   r   r   r   r   unionr   r  compiler   r-  r   r   r   r  r	  r   r  r  r  r)  r5  bytesboolrN   rN   rL   <module>rc     s    	    ( K K +& & & & & & & & & &N I H > >
 
 
 
		8	$M{ Ms M
HV * F )<8 (../FG?2::g&  K +, 
 c] 
  c]	(u
uu c]up4c"43<4 S\4 	4
 eCj4nAAS\A 3<A S\	A
 A eCjA$ :  		 6
66 c]6r>
>> c]>B#8E#J/ #C #gC g) g gX ""/3/3#(oeS+-.oo o 	o
 
#o o uXs]+,o uXs]+,o !o c]orN   