o
    DŒhC  ã                   @   s¤  d Z ddlZddlZddlmZmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ e e¡Ze d	¡Ze d
¡Ze d¡Ze d¡Ze d¡Ze d¡Ze d¡Ze d¡Ze d¡Z e d¡Z!d-de"dee" dee" fdd„Z#de"de"fdd„Z$	d.de"de%dee" de"fdd„Z&de"de"fd d!„Z'd"e"de"fd#d$„Z(d-d%e"dee" de"fd&d'„Z)			(d/d)eee"f de%dee" d*e%de"f
d+d,„Z*dS )0z0
Functions performing URL trimming and cleaning
é    N)ÚOptionalÚUnion)Úparse_qsÚquoteÚ	urlencodeÚ
urlunsplitÚSplitResulté   )Úis_valid_url)ÚALLOWED_PARAMSÚLANG_PARAMSÚTARGET_LANGS)Ú_parsez	https?://zZ(https?://[^">&? ]+?)(?:https?://)|(?:https?://[^/]+?/[^/]+?[&?]u(rl)?=)(https?://[^"> ]+)z)https?://.+?(https?://.+?)(?:https?://|$)z(?<=\w):(?:80|443)z/+z^(?:/\.\.(?![^/]))+z</?[a-z]{,4}?>|{.+?}z/\&$z(.*?)[<>"\s]zí^(?:dc|fbc|gc|twc|yc|ysc)lid|^(?:click|gbra|msclk|igsh|partner|wbra)id|^(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)ÚurlÚlanguageÚreturnc              	   C   s,   z	t t| ƒd|ƒW S  ttfy   Y dS w )z4Helper function: chained scrubbing and normalizationFN)Únormalize_urlÚ	scrub_urlÚAttributeErrorÚ
ValueError)r   r   © r   úN/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/courlan/clean.pyÚ	clean_url0   s
   ÿr   c                 C   sJ  d  |  ¡ ¡ d¡} |  d¡r|  dd¡ dd¡} t d| ¡} t d|  dd¡¡} t 	| ¡}t
|ƒdkrpd| vrpt d	t
|ƒ| ¡ t | ¡}|rYt|d ƒrY|d } t d
| ¡ nt | ¡}|rpt|d ƒrp|d } t d
| ¡ t | ¡}|r{|d } t
| ƒdkrt d| dd… d t
| ƒ¡ |  d¡dksž|  d¡dkr£|  d¡} | S )z@Strip unnecessary parts and make sure only one URL is consideredÚ z  	
z	<![CDATA[z]]>z&amp;ú&r	   zweb.archive.orgzdouble url: %s %sztaking url: %siô  z$invalid-looking link %s of length %dNé2   u   â€¦ú/é   z://)ÚjoinÚsplitÚstripÚ
startswithÚreplaceÚREMAINING_MARKUPÚsubÚTRAILING_AMPÚ	PROTOCOLSÚfindallÚlenÚLOGGERÚdebugÚ	SELECTIONÚmatchr
   Ú
MIDDLE_URLÚTRAILING_PARTSÚcountÚrstrip)r   Ú	protocolsr,   r   r   r   r   8   s4   ÿ





r   FÚquerystringÚstrictc                 C   s    | sdS t | ƒ}i }t|ƒD ];}| ¡ }|r |tvr|tvrqnt |¡r&q|tv rC|tv rCt|| d ƒt| vrCt	 
d||¡ t‚|| ||< qt|ddS )zStrip unwanted query elementsr   r   zbad lang: %s %sT)Údoseq)r   ÚsortedÚlowerr   r   ÚTRACKERS_REÚsearchr   Ústrr)   r*   r   r   )r2   r3   r   ÚqdictÚnewqdictÚqelemÚteststrr   r   r   Úclean_queryj   s&   €
r>   Ústringc              	   C   sv   d| vr| S g }|   d¡D ](}| ¡  d¡r0z
| d¡ d¡}W n ty/   t d|¡ Y nw | |¡ qd 	|¡S )z@Probe for punycode in lower-cased hostname and try to decode it.zxn--Ú.Úutf8Úidnazinvalid utf/idna string: %s)
r   r6   r!   ÚencodeÚdecodeÚUnicodeErrorr)   r*   Úappendr   )r?   ÚpartsÚpartr   r   r   Údecode_punycode‹   s   ÿ
rI   Úurl_partc                 C   s   t | ddS )zbNormalize URLs parts (specifically path and fragment) while
    accounting for certain characters.z/%!=:,-)Úsafe)r   )rJ   r   r   r   Únormalize_part   s   rL   Úfragmentc                 C   s:   d| v rd| v rt | d|ƒ} t| ƒS t | ¡rd} t| ƒS )zNLook for trackers in URL fragments using query analysis, normalize the output.ú=r   Fr   )r>   r7   r8   rL   )rM   r   r   r   r   Únormalize_fragment£   s   
þrO   TÚ
parsed_urlÚtrailing_slashc           	      C   sÖ   t | ƒ} | j ¡ }t| j ¡ ƒ}z| jdv rt d|¡}W n	 ty&   Y nw t	t
 dt d| j¡¡ƒ}t| j||ƒp<d}|rD|sDd}n|sX|sXt|ƒdkrX| d¡rX| d¡}|r\dnt| j|ƒ}t|||||fƒS )zFTakes a URL string or a parsed URL and returns a normalized URL string)éP   i»  r   r   r	   )r   Úschemer6   rI   ÚnetlocÚportÚ	NETLOC_REr$   r   rL   ÚPATH2ÚPATH1Úpathr>   Úqueryr(   Úendswithr0   rO   rM   r   )	rP   r3   r   rQ   rS   rT   ÚnewpathÚnewqueryÚnewfragmentr   r   r   r   ­   s0   

€ÿÿþÿ
r   )N)FN)FNT)+Ú__doc__ÚloggingÚreÚtypingr   r   Úurllib.parser   r   r   r   r   Úfiltersr
   Úsettingsr   r   r   Úurlutilsr   Ú	getLoggerÚ__name__r)   Úcompiler&   r+   r-   rV   rX   rW   r#   r%   r.   r7   r9   r   r   Úboolr>   rI   rL   rO   r   r   r   r   r   Ú<module>   sh    

ÿ






ÿ 	3ÿÿÿÿ
þ!ü
ÿþýüû