o
    Dh%                  
   @   st  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZ dd	lmZ d
dlmZ d
dlmZ edZeeddd Zeddd1dededede fddZ!d2dededee fddZ"d3dedede#fd d!Z$G d"d# d#Z%d$edefd%d&Z&d'\Z'Z(Z)Z*G d(d) d)Z+e+edZ,d*eddfd+d,Z-d-ed.ede fd/d0Z.dS )4z>Code parts dedicated to duplicate removal and text similarity.    N)SequenceMatcher)	lru_cache)blake2b)add)RLock)AnyDictListOptionalUnion)_Element   )LRU_SIZE)trimz\.[^/?#]{2,63}$	bit_countc                 C   s   t | dS )N1)bincount)x r   Z/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/deduplication.py<lambda>   s    r   i   maxsize      ?	reference
new_string	thresholdreturnc                 C   s,   t d| } t d|}td| | |kS )zIReturn the similarity ratio between two short strings, here domain names. N)STRIP_EXTENSIONsubr   ratio)r   r   r   r   r   r   is_similar_domain   s   r#   @   inputstringlengthc                    st   g }|   D ]}|tj}| r|| qg }tdddD ]  fdd|D }t||d kr7|  S q |S )zbSplit input into list of tokens and adjust length threshold to make sure
    there is enough data.   c                    s   g | ]
}t | kr|qS r   )len).0tir   r   
<listcomp>-   s    z!sample_tokens.<locals>.<listcomp>   )splitstripstringpunctuationisalnumappendranger)   )r%   r&   tokenstokensampler   r,   r   sample_tokens#   s   
r:      c                 C   s&   d t|  }t| |d S )z=Create a bag of words and generate a hash for a given string. digest_size)joinr:   r1   r   encodedigest)r%   r&   
teststringr   r   r   generate_bow_hash3   s   rC   c                	   @   s   e Zd ZdZddgZ			d!dededee d	dfd
dZded	efddZ	e
ddded	ee fddZded	efddZd	efddZded	ee fddZdeeeef  d	ee fddZded	efddZded	efdd ZdS )"SimhashzAImplement a basic Charikar hashing approach of string similarity.hashr&   r   r$   Nr%   existing_hashr   c                 C   s    || _ | |p| || _dS )z&Store length and existing or new hash.N)r&   validatecreate_hashrE   )selfr%   r&   rF   r   r   r   __init__>   s   zSimhash.__init__c                 C   s   t t| dd dS )z&Return a numerical hash of the string.   r=   big)int
from_bytesr   r@   rA   )rI   r%   r   r   r   _hashH   s   zSimhash._hashi @  r   r8   c                    s    fddt  jD S )z2Create vector to add to the existing string vectorc                    s&   g | ]}  d |> @ rd ndqS )r   r(   )rO   r*   r-   rI   r8   r   r   r.   ]   s   & z*Simhash._vector_to_add.<locals>.<listcomp>)r6   r&   rQ   r   rQ   r   _vector_to_addZ   s   zSimhash._vector_to_addc                    sP   dg| j   t|| j D ]}ttt | | qt fddt| j D S )zCalculates a Charikar simhash. References used:
        https://github.com/vilda/shash/
        https://github.com/sean-public/python-hashes/blob/master/hashes/simhash.py
        Optimized for Python by @adbar.
        r   c                 3   s$    | ]} | d krd|> V  qdS )r   r   Nr   rP   vectorr   r   	<genexpr>j   s   " z&Simhash.create_hash.<locals>.<genexpr>)r&   r:   listmapr   rR   sumr6   )rI   r%   r8   r   rS   r   rH   _   s   zSimhash.create_hashc                 C   s   t | jdd S )z3Convert the numerical hash to a hexadecimal string.r/   N)hexrE   rI   r   r   r   to_hexl   s   zSimhash.to_hex	inputhashc              	   C   s&   zt |dW S  ttfy   Y dS w )z2Convert the hexadecimal hash to a numerical value.   N)rM   	TypeError
ValueErrorrI   r\   r   r   r   _hash_to_intp   s
   zSimhash._hash_to_intc                 C   sj   t |trdtt|  krdkr|S  t |tr3| r.dt|  kr-dkr.t|S  | |S dS )z9Validate the input hash and return it, or None otherwise.      N)
isinstancerM   r)   strisdigitra   r`   r   r   r   rG   w   s   $

zSimhash.validate
other_hashc                 C   s   t | j|jA S )zJReturn distance between two hashes of equal length using the XOR operator.)BIN_COUNT_FUNCrE   rI   rg   r   r   r   hamming_distance   s   zSimhash.hamming_distancec                 C   s   | j | | | j  S )zjCalculate how similar this hash is from another simhash.
        Returns a float from 0.0 to 1.0.
        )r&   rj   ri   r   r   r   
similarity   s   zSimhash.similarity)r   r$   N)__name__
__module____qualname____doc__	__slots__re   rM   r
   rJ   rO   r   r	   rR   rH   r[   ra   r   rG   r   rj   floatrk   r   r   r   r   rD   :   s0    

"rD   contentc                 C   s   t |  S )zACalculate a simhash hex value for meaningful bits of the content.)rD   r[   )rr   r   r   r   content_fingerprint   s   rs   )r   r   r/      c                   @   sh   e Zd ZdZddeddfddZdedefd	d
ZdedefddZde	deddfddZ
dddZdS )LRUCachea  
    Pure-Python Least Recently Used (LRU) cache using a circular doubly linked list
    Adapted from CPython functools.py lru_cache decorator implementation
    https://github.com/python/cpython/blob/3.9/Lib/functools.py#L524
    First adapted by https://github.com/vbarbaresi
       r   r   Nc                 C   s>   t  | _|| _i | _g | _| j| jd d g| jd d < d| _d S )NF)r   lockr   cacherootfull)rI   r   r   r   r   rJ      s   
zLRUCache.__init__linkc                 C   sP   |\}}}}|||t < |t< | jt }| |t < | jt< ||t< | j|t < |S )N)NEXTPREVry   )rI   r{   	link_prev	link_next_keyresultlastr   r   r   
_move_link   s   

zLRUCache._move_linkkeyc                 C   sT   | j  | j|}|r| |W  d   S W d   dS 1 s#w   Y  dS )zgTests if the key that is asked for is in the cache
        and retrieve its value from the linked list.Nr(   )rw   rx   getr   )rI   r   r{   r   r   r   r      s   
zLRUCache.getvaluec                 C   s  | j ~ | j|}|r| | || j| t< nX| jrF| j}|||t< |t< |t | _| jt }d | jt< | jt< | j|= || j|< n3| jt	 }|| j||g}| |t<  | jt	< | j|< t
| j| jk| _W d   dS W d   dS W d   dS 1 sw   Y  dS )z Stores a given key in the cache.N)rw   rx   r   r   RESULTrz   ry   KEYr|   r}   r)   r   )rI   r   r   r{   oldrootoldkeyr   r   r   r   put   s,   



"zLRUCache.putc                 C   sV   | j  | j  | j| jddg| jdd< d| _W d   dS 1 s$w   Y  dS )zDelete all cache content.NF)rw   rx   clearry   rz   rZ   r   r   r   r      s
   
"zLRUCache.clear)rv   )r   N)rl   rm   rn   ro   rM   rJ   r   r   r   re   r   r   r   r   r   r   ru      s    
	%ru   rB   c                 C   s.   t | }|dkr|d nd}t | | dS )zImplement LRU cache.r(   r   N)LRU_TESTr   r   )rB   cachevalr   r   r   r   put_in_cache   s   
r   elementoptionsc                 C   sT   t d|  }t||jkr$t|}||jkr$t||d  dS t	| dS )z(Check for duplicate text with LRU cache.r<   r   TF)
r   r?   itertextr)   min_duplcheck_sizer   r   max_repetitionsr   r   )r   r   rB   r   r   r   r   duplicate_test   s   

r   )r   )r$   )r;   )/ro   rer2   difflibr   	functoolsr   hashlibr   operatorr   	threadingr   typingr   r   r	   r
   r   
lxml.etreer   settingsr   utilsr   compiler    getattrrM   rh   re   rq   boolr#   r:   bytesrC   rD   rs   r}   r|   r   r   ru   r   r   r   r   r   r   r   <module>   s2    
S
S