
    Ig2                         d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ  ej(                  e      Z e       ddd	d
dd
dZdededefdZ G d de      Zy)zWeb base loader class.    N)AnyDictIteratorListOptionalSequenceUnion)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageRefererDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                    d|i}| j                  d      x}r|j                         |d<   | j                  dddi      x}r|j                  dd      |d<   | j                  d	      x}r|j                  d
d      |d<   |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r   s         j/var/www/html/answerous/venv/lib/python3.12/site-packages/langchain_community/document_loaders/web_base.py_build_metadatar&      s    #H		'""u"!NN,iiv}.EiFF{F"-//)=T"Uyy  t #xx0DEO    c            $          e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(dddeeee   f   dee   de	dee   d	e	d
e	dee   dee   de
dedeeeef      de	deeeef      deeeef      dede	ddf"dZedefd       Z	 d)dede
de
dedef
dZdedej&                  defdZdee   defd Zed!eddfd"       Zd*dee   d!eedf   dee   fd#Z	 	 d+ded!eedf   dee   defd$Zd*d!eedf   defd%Zdee   fd&Zdee   fd'Zy),WebBaseLoadera  
    WebBaseLoader document loader integration

    Setup:
        Install ``langchain_community``.

        .. code-block:: bash

            pip install -U langchain_community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(
                web_path = "https://www.espn.com/"
                # header_template = None,
                # verify_ssl = True,
                # proxies = None,
                # continue_on_failure = False,
                # autoset_encoding = True,
                # encoding = None,
                # web_paths = (),
                # requests_per_second = 2,
                # default_parser = "html.parser",
                # requests_kwargs = None,
                # raise_for_status = False,
                # bs_get_text_kwargs = None,
                # bs_kwargs = None,
                # session = None,
                # show_progress = True,
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}


    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}

    NT)show_progressweb_pathheader_template
verify_sslproxiescontinue_on_failureautoset_encodingencoding	web_pathsrequests_per_seconddefault_parserrequests_kwargsraise_for_statusbs_get_text_kwargs	bs_kwargssessionr*   r   c                *   |r|rt        d      |rt        |      | _        n^t        |t              r	|g| _        nEt        |t
              rt        |      | _        n$t        dt        |       dt        |       d      |	| _        |
| _	        |xs i | _
        || _        || _        |xs i | _        |xs i | _        |r|| _        nt!        j"                         }|xs t$        j'                         }|j)                  d      s	 ddlm}  |       j.                  |d<   t7        |      |_        ||_        |r|j<                  j?                  |       || _        || _         || _!        || _"        y	# t0        $ r t2        j5                  d       Y qw xY w)
a6  Initialize loader.

        Args:
            web_paths: Web paths to load from.
            requests_per_second: Max number of concurrent requests to make.
            default_parser: Default parser to use for BeautifulSoup.
            requests_kwargs: kwargs for requests
            raise_for_status: Raise an exception if http status code denotes an error.
            bs_get_text_kwargs: kwargs for beatifulsoup4 get_text
            bs_kwargs: kwargs for beatifulsoup4 web page parsing
            show_progress: Show progress bar when loading pages.
        zmReceived web_path and web_paths. Only one can be specified. web_path is deprecated, web_paths should be used.z+web_path must be str or Sequence[str] got (z*) or web_paths must be Sequence[str] got ()r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)#
ValueErrorlistr2   
isinstancestrr   	TypeErrortyper3   r4   r5   r6   r*   r7   r8   r9   requestsSessiondefault_header_templatecopyr#   fake_useragentr<   randomImportErrorloggerinfodictheadersverifyr.   updater/   r0   r1   )selfr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r*   r<   s                     r%   __init__zWebBaseLoader.__init__p   s   B 	D  !)_DN#&&ZDN(+!(^DN=d8n=M N99=i8IL  $7 ,.4" 0*"4":"b"DL&&(G-O1H1M1M1OO"&&|4	84=K4F4FOL1 #?3GO'GN&&w/"DL#6  0  # KK8s   E1 1FFc                 f    t        | j                        dkD  rt        d      | j                  d   S )N   zMultiple webpaths found.r   )lenr2   r=   )rP   s    r%   r+   zWebBaseLoader.web_path   s.    t~~"788~~a  r'   r   retriescooldownbackoffc                   K   t        j                         4 d {   }t        |      D ]  }	 t        | j                  j
                  | j                  j                  j                               }| j                  j                  sd|d<    |j                  |fi |4 d {   }| j                  r|j                          |j                          d {   cd d d       d {    c cd d d       d {    S  d d d       d {    t#        d
      7 7 {7 I7 ;7 *# 1 d {  7  sw Y   nxY w# t         j                  $ r]}	||dz
  k(  r t        j                  d| d|dz    d| d|	 d		       t        j                   |||z  z         d {  7   Y d }	~	d }	~	ww xY w7 # 1 d {  7  sw Y   t#        d
      xY ww)N)rM   cookiesFsslrS   Error fetching z with attempt /z: z. Retrying...zretry count exceeded)aiohttpClientSessionrangerL   r9   rM   rY   get_dictrN   r#   r6   textClientConnectionErrorrJ   warningasynciosleepr=   )
rP   r   rU   rV   rW   r9   ikwargsresponsees
             r%   _fetchzWebBaseLoader._fetch   s     ((* 	C 	Cg7^ CC#' $ 4 4 $ 4 4 = = ?$F  <<..(-u*w{{39&9 5 5X00$557%-]]_45 5 5	C 	C 	CC	C 	C, /00-	C5  55	C5 5 5 5 44 CGaK'-cU. 1ugQwir!MC &mmHwz,ABBBC	C 	C 	C 	C, /00s   GDGF4A5D?$D 
%D?(0D(D"
D(D?(D$
)D?-F4/G;D&<GF4GF2G D?"D($D?&G(D:.D1/D:6D?=F4?F/AF*F!
F*$F4*F//F42G4G:F=;GG	semaphorec                 z  K   |4 d {    	 | j                  |       d {   cd d d       d {    S 7 /7 7 	# t        $ r[}| j                  r/t        j	                  d| d       Y d }~d d d       d {  7   yt        j                  d| d       |d }~ww xY w# 1 d {  7  sw Y   y xY ww)Nr[   z*, skipping due to continue_on_failure=True za and aborting, use continue_on_failure=True to continue loading urls after encountering an error.)rj   	Exceptionr/   rJ   rc   	exception)rP   r   rk   ri   s       r%   _fetch_with_rate_limitz$WebBaseLoader._fetch_with_rate_limit   s       	 	![[--	 	 	-	  ++NN)# /4 5 	 	 	   %cU +L L 	 	 	s   B;9B;B&?;?B;=B;?B;	B#%B-B&1B;<A?=B;BB##B&&B8,B/-B84B;urlsc                   K   t        j                  | j                        }g }|D ]8  }t        j                  | j	                  ||            }|j                  |       : 	 | j                  r"ddlm}  |j                  |dddd d{   S t        j                  |  d{   S 7 7 # t        $ r3 t        j                  d       t        j                  |  d{  7  cY S w xY ww)	z/Fetch all urls concurrently with rate limiting.r   )tqdm_asynciozFetching pagesTrS   )descasciiminintervalNz2For better logging of progress, `pip install tqdm`)rd   	Semaphorer3   ensure_futurerp   appendr*   tqdm.asynciors   gatherrI   warningswarn)rP   rq   rk   tasksr   taskrs   s          r%   	fetch_allzWebBaseLoader.fetch_all   s     %%d&>&>?	 	C(()D)DS))TUDLL		0!!50\00!11   %^^U333	 4 	0MMNO ////	0sf   AC-")B. B*B. C-B. %B,&B. )C-*B. ,B. .3C*!C$"C*'C-)C**C-parserc                 T    g d}| |vr t        ddj                  |      z   dz         y)z#Check that parser is valid for bs4.)html.parserlxmlxmlzlxml-xmlhtml5libz`parser` must be one of z, .N)r=   join)r   valid_parserss     r%   _check_parserzWebBaseLoader._check_parser	  s:     O&*TYY}-EEK  'r'   c                 :   ddl m} t        j                  | j	                  |            }g }t        |      D ]`  \  }}||   }|1|j                  d      rd}n| j                  }| j                  |       |j                   |||fi | j                         b |S )z2Fetch all urls, then return soups for all results.r   BeautifulSoup.xmlr   )bs4r   rd   runr   	enumerateendswithr4   r   ry   r8   )	rP   rq   r   r   resultsfinal_resultsrf   resultr   s	            r%   
scrape_allzWebBaseLoader.scrape_all  s    %++dnnT23"7+ 	RIAvq'C~<<'"F!00F""6*  vv!P!PQ	R r'   c                    ddl m} | |j                  d      rd}n| j                  }| j	                  |        | j
                  j                  |fi | j                  }| j                  r|j                          | j                  | j                  |_	        n| j                  r|j                  |_	         ||j                  |fi |xs i S )Nr   r   r   r   )r   r   r   r4   r   r9   r#   r5   r6   r1   r0   apparent_encodingra   )rP   r   r   r8   r   html_docs         r%   _scrapezWebBaseLoader._scrape$  s     	&>||F#,,6"#4<<##C@4+?+?@  %%'==$ $H"" ( : :HX]]FHyBHHr'   c                 n    || j                   }| j                  | j                  || j                        S )z?Scrape data from webpage and return it in BeautifulSoup format.)r   r8   )r4   r   r+   r8   )rP   r   s     r%   scrapezWebBaseLoader.scrape>  s1     >((F||DMM&DNN|SSr'   c              #      K   | j                   D ]V  }| j                  || j                        } |j                  di | j                  }t        ||      }t        ||       X yw)z+Lazy load text from the url(s) in web_path.)r8   page_contentr$   N )r2   r   r8   r"   r7   r&   r
   )rP   pathr   ra   r$   s        r%   	lazy_loadzWebBaseLoader.lazy_loadF  sc     NN 	AD<<<?D 4==;4#:#:;D&tT2Hx@@		As   A'A)c                    | j                  | j                        }g }t        | j                  |      D ]I  \  }} |j                  di | j                  }t        ||      }|j                  t        ||             K |S )z9Load text from the urls in web_path async into Documents.r   r   )r   r2   zipr"   r7   r&   ry   r
   )rP   r   docsr   r   ra   r$   s          r%   aloadzWebBaseLoader.aloadN  sz     //$..1dnng6 	HJD$ 4==;4#:#:;D&tT2HKKdXFG	H
 r'   )rm   NTNFTNr      r   NFNNN)   r   g      ?)N)NN) __name__
__module____qualname____doc__r	   r@   r   r   rL   boolintr   r   rQ   propertyr+   floatrj   rd   rw   rp   r   r   staticmethodr   r   r   r   r   r
   r   r   r   r'   r%   r)   r)   )   s   DP /1*."&$)!%"&#%#$+48!&7;.2!O!$ #%O!Xc]*+O! "$O! 	O!
 $O! "O! O! 3-O! C=O! !O! O! "$sCx.1O! O! %T#s(^4O! DcN+O!  !O!$ %O!& 
'O!b !# ! ! OR11!$1471FK1	16#*#4#4	&0DI 0# 0( c d  tCy %T	2B dSVi * $($(	II c4i I D>	I
 
I4TU39- T TA8H- A
tH~ 
r'   r)   )r   rd   loggingr|   typingr   r   r   r   r   r   r	   r]   rC   langchain_core.documentsr
   )langchain_community.document_loaders.baser   $langchain_community.utils.user_agentr   	getLoggerr   rJ   rE   r@   rL   r&   r)   r   r'   r%   <module>r      s        G G G   - @ ?			8	$ !"'(!$	 	# 	C 	D 	oJ or'   