
    (#h%              
          d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZ dd	lmZ d
dlmZ d
dlmZ  ej2                  d      Z eedd       Z ed      d'dededede fd       Z!d(dededee   fdZ"d)dedede#fdZ$ G d d      Z%dedefdZ&d\  Z'Z(Z)Z* G d  d!      Z+ e+e      Z,d"eddfd#Z-d$ed%ede fd&Z.y)*z>Code parts dedicated to duplicate removal and text similarity.    N)SequenceMatcher)	lru_cache)blake2b)add)RLock)AnyDictListOptionalUnion)_Element   )LRU_SIZE)trimz\.[^/?#]{2,63}$	bit_countc                 6    t        |       j                  d      S )N1)bincount)xs    V/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/deduplication.py<lambda>r      s    SV\\#5F     i   maxsize	reference
new_string	thresholdreturnc                     t         j                  d|       } t         j                  d|      }t        d| |      j                         |k\  S )zIReturn the similarity ratio between two short strings, here domain names. N)STRIP_EXTENSIONsubr   ratio)r   r   r   s      r   is_similar_domainr%      sE      ##B	2I $$R4J4J7==?9LLr   inputstringlengthc                 N   g }| j                         D ]C  }|j                  t        j                        }|j	                         s3|j                  |       E g }t        ddd      D ]5  }|D cg c]  }t        |      |kD  s| }}t        |      |dz  k\  s3|c S  |S c c}w )zbSplit input into list of tokens and adjust length threshold to make sure
    there is enough data.      )splitstripstringpunctuationisalnumappendrangelen)r&   r'   tokenstokensampleits          r   sample_tokensr9   #   s     F""$ !F../==?MM% ! F1b" #2s1vz!22v;&1*$M M 3s   /B"B"c                     dj                  t        |             j                         }t        |j	                         |      j                         S )z=Create a bag of words and generate a hash for a given string. digest_size)joinr9   r-   r   encodedigest)r&   r'   
teststrings      r   generate_bow_hashrB   3   s=    -45;;=J:$$&F;BBDDr   c            	           e Zd ZdZddgZ	 	 	 ddededee   ddfdZdedefd	Z	 e
d
      dedee   fd       ZdedefdZdefdZdedee   fdZdeeeef      dee   fdZdedefdZdedefdZy)SimhashzAImplement a basic Charikar hashing approach of string similarity.hashr'   Nr&   existing_hashr   c                 d    || _         | j                  |      xs | j                  |      | _        y)z&Store length and existing or new hash.N)r'   validatecreate_hashrE   )selfr&   r'   rF   s       r   __init__zSimhash.__init__>   s+     MM-0QD4D4D[4Q	r   c                 |    t         j                  t        |j                         d      j	                         d      S )z&Return a numerical hash of the string.   r<   big)int
from_bytesr   r?   r@   )rJ   r&   s     r   _hashzSimhash._hashH   s1    ~~K&&(a8??A5
 	
r   i @  r   r5   c                     t        | j                        D cg c]  }| j                  |      d|z  z  rdnd c}S c c}w )z2Create vector to add to the existing string vectorr   r*   )r2   r'   rQ   )rJ   r5   r7   s      r   _vector_to_addzSimhash._vector_to_addZ   s<     DICUVaTZZ&!q&1r9VVVs   "=c           
          dg| j                   z  t        || j                         D ]+  }t        t        t        | j                  |                  - t        fdt        | j                         D              S )zCalculates a Charikar simhash. References used:
        https://github.com/vilda/shash/
        https://github.com/sean-public/python-hashes/blob/master/hashes/simhash.py
        Optimized for Python by @adbar.
        r   c              3   :   K   | ]  }|   d k\  sd|z    yw)r   r   N ).0r7   vectors     r   	<genexpr>z&Simhash.create_hash.<locals>.<genexpr>j   s     Haa16Hs   
)r'   r9   listmapr   rS   sumr2   )rJ   r&   r5   rX   s      @r   rI   zSimhash.create_hash_   sk     t{{"";< 	HE#c64+>+>u+EFGF	H H5#5HHHr   c                 2    t        | j                        dd S )z3Convert the numerical hash to a hexadecimal string.r+   N)hexrE   rJ   s    r   to_hexzSimhash.to_hexl   s    499~ab!!r   	inputhashc                 F    	 t        |d      S # t        t        f$ r Y yw xY w)z2Convert the hexadecimal hash to a numerical value.   N)rO   	TypeError
ValueErrorrJ   ra   s     r   _hash_to_intzSimhash._hash_to_intp   s+    	y"%%:& 		s      c                    t        |t              r dt        t        |            cxk  rdk  r|S  t        |t              rA|j	                         r dt        |      cxk  rdk  rt        |      S  | j                  |      S y)z9Validate the input hash and return it, or None otherwise.      N)
isinstancerO   r3   strisdigitrg   rf   s     r   rH   zSimhash.validatew   sy    i%"C	N0C*Ir*I +Ji%  "rS^'Ar'A9~% (B $$Y//r   
other_hashc                 F    t        | j                  |j                  z        S )zJReturn distance between two hashes of equal length using the XOR operator.)BIN_COUNT_FUNCrE   rJ   rn   s     r   hamming_distancezSimhash.hamming_distance   s    dii*//9::r   c                 X    | j                   | j                  |      z
  | j                   z  S )zjCalculate how similar this hash is from another simhash.
        Returns a float from 0.0 to 1.0.
        )r'   rr   rq   s     r   
similarityzSimhash.similarity   s'     d33J??4;;NNr   )r!   @   N)__name__
__module____qualname____doc__	__slots__rl   rO   r   rK   rQ   r   r
   rS   rI   r`   rg   r   rH   r   rr   floatrt   rV   r   r   rD   rD   :   s   G"I '+	RR R  }	R
 
R
 
 
$ uWC WDI W WIs Is I" "c hsm 	(5c?"; 	 	;3 ;3 ;OS OU Or   rD   contentc                 4    t        |       j                         S )zACalculate a simhash hex value for meaningful bits of the content.)rD   r`   )r|   s    r   content_fingerprintr~      s    7""$$r   )r   r   r+      c                   ^    e Zd ZdZddeddfdZdedefdZdedefd	Zde	d
eddfdZ
ddZy)LRUCachea  
    Pure-Python Least Recently Used (LRU) cache using a circular doubly linked list
    Adapted from CPython functools.py lru_cache decorator implementation
    https://github.com/python/cpython/blob/3.9/Lib/functools.py#L524
    First adapted by https://github.com/vbarbaresi
    r   r   Nc                     t               | _        || _        i | _        g | _        | j                  | j                  d d g| j                  d d  d| _        y )NF)r   lockr   cacherootfull)rJ   r   s     r   rK   zLRUCache.__init__   sE    G	+-
!			499dD9		!	r   linkc                     |\  }}}}||c|t         <   |t        <   | j                  t           }|x|t         <   | j                  t        <   ||t        <   | j                  |t         <   |S )N)NEXTPREVr   )rJ   r   	link_prev	link_next_keyresultlasts          r   
_move_linkzLRUCache._move_link   sb    -1*	9dF+4i(	$4yy'++T
TYYt_T
YYT
r   keyc                     | j                   5  | j                  j                  |      }|r| j                  |      cddd       S 	 ddd       y# 1 sw Y   yxY w)zgTests if the key that is asked for is in the cache
        and retrieve its value from the linked list.Nr*   )r   r   getr   )rJ   r   r   s      r   r   zLRUCache.get   sS     YY 	-::>>#&Dt,	- 	-	- 		- s   .AAvaluec                    | j                   5  | j                  j                  |      }|r)| j                  |       || j                  |   t        <   n| j
                  r| j                  }||c|t        <   |t        <   |t           | _        | j                  t           }dx| j                  t        <   | j                  t        <   | j                  |= || j                  |<   nu| j                  t           }|| j                  ||g}|x|t        <   x| j                  t        <   | j                  |<   t        | j                        | j                  k\  | _        ddd       y# 1 sw Y   yxY w)z Stores a given key in the cache.N)r   r   r   r   RESULTr   r   KEYr   r   r3   r   )rJ   r   r   r   oldrootoldkeyr   s          r   putzLRUCache.put   s*    YY  	@::>>#&D%*/

3'99"iiG471GCL'&/ !(DI!YYs^F9==DIIcNTYYv%6

6* '.DJJsO  99T?D $))S%8DEIIDJI44::c? !$DJJ4<< ?DIA 	@  	@  	@s   EE!!E*c                     | j                   5  | j                  j                          | j                  | j                  ddg| j                  dd d| _        ddd       y# 1 sw Y   yxY w)zDelete all cache content.NF)r   r   clearr   r   r_   s    r   r   zLRUCache.clear   sS    YY 	JJ IItyy$=DIIaLDI	 	 	s   A	AA()   )r   N)rv   rw   rx   ry   rO   rK   r   r   r   rl   r   r   rV   r   r   r   r      sd    	 	d 	s s s s #@s #@3 #@4 #@Jr   r   rA   c                 r    t         j                  |       }|dk7  r|dz   nd}t         j                  | |       y)zImplement LRU cache.r*   r   N)LRU_TESTr   r   )rA   cachevalr   s      r   put_in_cacher      s/    ||J'H$NHqLELLU#r   elementoptionsc                    t        dj                  | j                                     }t        |      |j                  kD  r>t
        j                  |      }||j                  kD  rt
        j                  ||dz          yt        |       y)z(Check for duplicate text with LRU cache.r;   r   TF)
r   r>   itertextr3   min_duplcheck_sizer   r   max_repetitionsr   r   )r   r   rA   r   s       r   duplicate_testr      sl    chhw//123J
:333<<
+g---LLX\2r   )g      ?)ru   )   )/ry   rer.   difflibr   	functoolsr   hashlibr   operatorr   	threadingr   typingr   r	   r
   r   r   
lxml.etreer   settingsr   utilsr   compiler"   getattrrO   rp   rl   r{   boolr%   r9   bytesrB   rD   r~   r   r   r   r   r   r   r   r   rV   r   r   <module>r      s@   @
 
  #     3 3    "**/0k+FG 4M M# M% MRV M Ms C c  E3 E EU EPO POf% % %
 % dCP Pf H%$S $T $H s t r   