
    :Qg                     b   d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZ ddedededej                  fdZddededed	edef
d
Zddededed	edef
dZdedee   fdZdedee   fdZdedee   fdZdedee   fdZdedee j                      fdZdefdZdefdZdedee   fdZy)    N)ListOptional)EMAIL_ADDRESS_PATTERNEMAIL_DATETIMETZ_PATTERNIMAGE_URL_PATTERNIP_ADDRESS_NAME_PATTERNIP_ADDRESS_PATTERN_REMAPI_ID_PATTERNUS_PHONE_NUMBERS_REtextpatternindexreturnc                     t        |t              r|dk  rt        d| d      d }t        t	        j
                  ||             D ]  \  }}||k(  s|} |t        d| d d      |S )Nr   zThe index is z'. Index must be a non-negative integer.zResult with index z& was not found. The largest index was .)
isinstanceint
ValueError	enumeraterefinditer)r   r   r   regex_matchiresults         Z/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/cleaners/extract.py_get_indexed_matchr      s    eS!UQY=/VWXXKr{{7D9: !	6: K! -eW4Z[\Z]]^_``    stripc                 t    t        | ||      }|j                         \  }}| d| }|r|j                         S |S )a  Extracts texts that occurs before the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index.

    Input
    -----
    strip: If True, removes trailing whitespace from the extracted string
    N)r   spanrstrip)r   r   r   r   r   start_before_texts           r   extract_text_beforer%      sE     %T7E:K!HE1v,K#(;9k9r   c                 t    t        | ||      }|j                         \  }}| |d }|r|j                         S |S )a  Extracts texts that occurs before the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index.

    Input
    -----
    strip: If True, removes leading whitespace from the extracted string
    N)r   r    lstrip)r   r   r   r   r   r#   endr$   s           r   extract_text_afterr)   .   sE     %T7E:KFAsst*K#(;9k9r   c                 R    t        j                  t        | j                               S N)r   findallr   lowerr   s    r   extract_email_addressr/   =   s    ::+TZZ\::r   c                 6    t        j                  t        |       S r+   )r   r,   r	   r.   s    r   extract_ip_addressr1   A   s    ::+T22r   c                 6    t        j                  t        |       S r+   )r   r,   r   r.   s    r   extract_ip_address_namer3   E   s    ::-t44r   c                     t        j                  t        |       }|D cg c]  }|j                  dd       }}|S c c}w )N; )r   r,   r
   replace)r   mapi_idsmids      r   extract_mapi_idr:   I   s:    zz/40H089C$9H9O :s   <c                     t        j                  t        |       }t        |      dkD  r#t        j                  j                  |d   d      S y )Nr   z%a, %d %b %Y %H:%M:%S %z)r   r,   r   lendatetimestrptime)r   date_extractionss     r   extract_datetimetzr@   O   sE    zz":DA
q   ))*:1*=?YZZr   c                     t        j                  |       }|y|j                         \  }}| || }|j                         S )a  Extracts a US phone number from a section of text that includes a phone number. If there
    is no phone number present, the result will be an empty string.

    Example
    -------
    extract_phone_number("Phone Number: 215-867-5309") -> "215-867-5309"
    r6   )r   searchr    r   )r   r   r"   r(   phone_numbers        r   extract_us_phone_numberrD   W   sI     &,,T2K!!#JE3c?Lr   c                 h   d\  }}}}| j                         }t        d|d   vd|d   v g      r|||fS t        j                   d|d         }|d   s|d= t        |d         dkD  r|||fS |^}}|r,	 |^}}}d	j                  |      }|rd	j                  |      nd
}|||fS # t        $ r |}Y 8w xY w)a  Extracts the start of bulleted text sections bullets
    accounting numeric and alphanumeric types.

    Output
    -----
    tuple(section, sub_section, sub_sub_section): Each bullet partition
    is a string or None if not present.

    Example
    -------
    This is a very important point -> (None, None, None)
    1.1 This is a very important point -> ("1", "1", None)
    a.1 This is a very important point -> ("a", "1", None)
    )NNNNr   r   z..z[\.])r   string   r6   N)splitanyr   r<   r   join)r   abctemptext_spbulletr#   s           r   extract_ordered_bulletsrR   h   s     +MAq!TjjlG
Cwqz!471:#567!QwXXggaj9F":2J
6!9~!QwHA	HAq1 GGAJBGGAJa7N	  	A	s   3B# #B10B1c                 6    t        j                  t        |       S r+   )r   r,   r   r.   s    r   extract_image_urls_from_htmlrT      s    ::'..r   )r   )r   T)r=   r   typingr   r   unstructured.nlp.patternsr   r   r   r   r	   r
   r   strr   Matchr   boolr%   r)   r/   r1   r3   r:   r@   rD   tuplerR   rT    r   r   <module>r\      s>    	 !  S 3 s 288 :c :C : : :X[ ::S :3 :s :t :WZ :; ;S	 ;3S 3T#Y 35# 5$s) 5# $s) S Xh.?.?%@  #  "#U #L/s /tCy /r   