
    (#h%                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ  ej4                  e      Zh dZ ej<                  d      Z ej<                  d      Z  ej<                  d      Z! ej<                  d      Z" ej<                  d      Z# ej<                  d      Z$ G d d      Z%de&de'fdZ(de	e&   de%de	e&   fdZ)de&de%de	e&   fdZ*de&de%de	e&   fdZ+de&de%de	e&   fdZ,de%de
e&   de	e&   fd Z-	 	 	 d(d!e&d"e
e&   d#e'd$e.de	e&   f
d%Z/d&e&d"e
e&   de	e&   fd'Z0y))z>
Examining feeds and extracting links for further processing.
    N)islice)sleep)ListOptional)	check_url	clean_urlfilter_urlsfix_relative_urlsget_hostinfois_valid_url   )is_similar_domain)	fetch_url)	MAX_LINKS)	load_html>   text/rdftext/rsstext/xml	text/atom
text/plaintext/rdf+xmltext/rss+xmltext/atom+xmlapplication/rdfapplication/rssapplication/xmlapplication/atomapplication/jsonapplication/rdf+xmlapplication/rss+xmlapplication/atom+xmlapplication/feed+jsonapplication/x-atom+xmlapplication/x.atom+xmlz<(feed|rss|\?xml)z<link .*?href=".+?"zhref="(.+?)"z:<link>(?:\s*)(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?(?:\s*)</link>z\bcomments\bzn\.(?:atom|rdf|rss|xml)$|\b(?:atom|rss)\b|\?type=100$|feeds/posts/default/?$|\?feed=(?:atom|rdf|rss|rss2)|feed$c                   D    e Zd ZdZg dZ	 	 ddededededee   d	dfd
Zy)FeedParametersz.Store necessary information to proceed a feed.basedomainextlangrefNbaseurlr)   	referenceexternaltarget_langreturnc                 J    || _         || _        || _        || _        || _        y )Nr'   )selfr-   r)   r.   r/   r0   s         N/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/feeds.py__init__zFeedParameters.__init__M   s'     !	!!#.	!    )FN)	__name__
__module____qualname____doc__	__slots__strboolr   r5    r6   r4   r&   r&   I   sU    48I %)"" " 	"
 " c]" 
"r6   r&   feed_stringr1   c                 L    t         j                  |       ry| dd }d|v xs d|v S )z$Check if the string could be a feed.TNd   z<rssz<feed)FEED_OPENINGmatch)r?   	beginnings     r4   is_potential_feedrE   \   s4    +&DS!IY6'Y"66r6   linklistparamsc                    g }t        t        |             D ]  }t        |j                  |      }t	        ||j
                        }|c|j                  sBd|vr>t        |j                  |d         s%t        j                  d|j                  |d          |j                  |d          d|v sd|v s|j                  |        |S )zGExamine links to determine if they are valid and
    lead to a web page)languagefeedr   z'Rejected, diverging domain names: %s %sr   
feedburner	feedproxy)sortedsetr
   r(   r   r+   r*   r   r)   LOGGERwarningappend)rF   rG   output_linksitemlinkcheckeds         r4   handle_link_listrV   d   s     Ls8}% & d3D6;;7JJ$&)&--D=v}}gVWj ##GAJ/T![D%8%#&& r6   c                 `   t        |       s| j                  d      rh	 t        j                  |       j	                  dg       D cg c]&  }|j	                  d      xs |j	                  d      ( }}|D cg c]  }||	 c}S t        j                  d|j                         g S d| v rVd t        t        j                  |       t              D        D cg c]"  }d	|vrd
|vrt        j                  |      d   $ c}S d| v rQt        t         j                  | t"        j$                        t              D cg c]  }|d   j'                          c}S g S c c}w c c}w # t        j
                  j                  $ r$ t        j                  d|j                         Y g S w xY wc c}w c c}w )z<Try different feed types and return the corresponding links.{itemsurlidzJSON decoding error: %szPossibly invalid feed: %sz<link c              3   &   K   | ]	  }|d      yw)r   Nr>   ).0ms     r4   	<genexpr>zfind_links.<locals>.<genexpr>   s      !s   zatom+xmlz
rel="self"r   z<link>)rE   
startswithjsonloadsgetdecoderJSONDecodeErrorrO   debugr)   r   
LINK_ATTRSfinditerr   	LINK_HREFsearchLINK_ELEMENTSreDOTALLstrip)r?   rG   rS   
candidatescrT   r^   s          r4   
find_linksrq      s   [)!!#&G !%

; 7 ; ;GR H HHUO5txx~5
  $.?a?? LL4fmmD	 ;$Z%8%8%EyQ
 %,d*B	 T"1%
 	
 ; M22;		JIV
 aDJJL
 	

 I= @<<// G6F 		G

sA   'E" +E0E" 6E>EE" 'F&9F+
E" "=F#"F#c                    | s"t         j                  d|j                         g S t        | j	                         |      }t        ||      D cg c]'  }||j                  k7  r|j                  d      dkD  r|) }}|r+t         j                  dt        |      t        |             |S t         j                  d|j                         |S c c}w )z7Extract and refine links from Atom, RSS and JSON feeds.zEmpty feed: %s/   z!Links found: %s of which %s validzInvalid feed for %s)	rO   rf   r)   rq   rn   rV   r,   countlen)r?   rG   
feed_linksrT   rR   s        r4   extract_linksrx      s    %v}}5	K--/8J %Z86::$**S/A"5 	L  /Z#lBS	
  	*FMM:s   ,C

htmlstringc           	      N   t        |       }|"t        j                  d|j                         g S |j	                  d      D cg c]P  }|j                  d      t        v s%t        j                  |j                  dd            r|j                  dd      R }}|sS|j	                  d      D cg c]9  }t        j                  |j                  dd            r|j                  dd      ; }}g }t        j                  |      D ]i  }t        |j                  |      }t        |      }|s'||j                  k7  s7t        |      sCt        j                  |      rY|j!                  |       k t        j                  dt#        |      t#        |             |S c c}w c c}w )zxParse the HTML and try to extract feed URLs from the home page.
    Adapted from http://www.aaronsw.com/2002/feedfinder/zInvalid HTML/Feed page: %sz//link[@rel="alternate"][@href]typehref z
//a[@href]z%Feed URLs found: %s of which %s valid)r   rO   rf   r(   xpathrc   
FEED_TYPESLINK_VALIDATION_RErj   dictfromkeysr
   r   r,   r   	BLACKLISTrQ   rv   )ry   rG   treerT   	feed_urlsoutput_urlss         r4   determine_feedr      sv    Z D|16;;?	
 JJ@A88Fz)$$TXXfb%9: 	I   

<0
!((&")=> HHVR 
	 
 Ki( 	% d3

"T"$$T*t$	% LL/Y[AQ A
s   AF/>F"	urlfilterc                     | j                   rlt        d| j                   d| j                    d      }|rDt        ||       }t	        ||      }t
        j                  dt        |      | j                         |S g S )z2Alternative way to gather feed links: Google News.z*https://news.google.com/rss/search?q=site:z&hl=z&scoring=n&num=100z!%s Google news links found for %s)r+   r   r)   rx   r	   rO   rf   rv   )rG   r   
downloadedrw   s       r4   probe_gnewsr      ss    {{8tFKK=Xjk

 &z6:J$Z;JLL3S_fmm Ir6   rZ   r0   r/   
sleep_timec                 ~   t        |       \  }}|t        j                  d|        g S t        ||| ||      }d}t	        |       }|t        ||      }	|	sVt        ||      D ]+  }
t	        |
      }|s|	j                  t        ||             - t        |       t        |      dz   kD  r| }|	r.t        |	|      }	t        j                  dt        |	      |       |	S t        j                  d|        nAt        j                  d|        | j                  d      |k7  rt        |       t        ||      S t        ||      S )a  Try to find feed URLs.

    Args:
        url: Webpage or feed URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).
        external: Similar hosts only or external URLs
                  (boolean, defaults to False).
        sleep_time: Wait between requests on the same website.

    Returns:
        The extracted links as a list (sorted list of unique links).

    NzInvalid URL: %srt   z%s feed links found for %szNo usable feed links found: %szCould not download web page: %srs   )r   rO   rP   r&   r   rx   r   extendrv   r	   rf   errorrn   r   try_homepager   )rZ   r0   r/   r   r)   r-   rG   r   r   rw   rJ   r?   s               r4   find_feed_urlsr      s/   * #3'OFG~(#.	GVS(KHFI3J":v6
&z6: J'o%%mK&HIJ
 3x#g,**	$Z;JLL5s:O5s;6<99S>W$*55vy))r6   r-   c                 F    t         j                  d|        t        | |      S )zhShift into reverse and try the homepage instead of the particular feed
    page that was given as input.z&Probing homepage for feeds instead: %s)rO   rf   r   )r-   r0   s     r4   r   r   4  s      LL97C';//r6   )NFg       @)1r:   ra   loggingrl   	itertoolsr   timer   typingr   r   courlanr   r   r	   r
   r   r   deduplicationr   	downloadsr   settingsr   utilsr   	getLoggerr7   rO   r   compilerB   rg   ri   rk   r   r   r&   r<   r=   rE   rV   rq   rx   r   r   floatr   r   r>   r6   r4   <module>r      s     	   !  -    			8	$
, rzz./RZZ./
BJJ'	

A BJJ'	RZZ " "&73 74 7tCy . T#Y 6%C % %DI %Ps N tCy 0)s )N )tCy )X 8C= T#Y $ "&	6*	6*#6* 6* 	6*
 
#Y6*r0# 0HSM 0d3i 0r6   