o
    DhG                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZmZmZmZ ddlmZ edZedZed	Zed
ejZ	d-dedede
e	d e	eef f fddZ	d.dedeee  dedee fddZdedefddZdedefddZdede	eef fddZdede	ee ef fddZdededefddZ d ee d!ee dee fd"d#Z!d/ded%ed&edefd'd(Z"d)ed*ee defd+d,Z#dS )0zD
Functions related to URL manipulation and extraction of URL parts.
    N)unescape)AnyListOptionalSetTupleUnion)urljoinurlsplit
urlunsplitSplitResult)get_tldz{(?:(?:f|ht)tp)s?://(?:[^/?#]{,63}\.)?([^/?#.]{4,63}\.[^/?#]{2,63}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-f:]{16,})(?:/|$)z(?<=\D):\d+z^www[0-9]*\.z(?:feed(?:burner|proxy))FurlfastreturnNNc                 C   s   | rt | ts	dS |r,t| }|r,td|d dd }|dd }|r,||fS t| ddd	}|d
u r9dS |jt	d|j
fS )z0Cached function to extract top-level domain infor       @.r   T)	as_objectfail_silentlyN)
isinstancestrDOMAIN_REGEXmatchSTRIP_PORT_REGEXsubsplitr   domainCLEAN_FLD_REGEXfld)r   r   domain_matchfull_domainclean_matchtldinfo r'   Q/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/courlan/urlutils.pyget_tldinfo   s   
r)   	blacklistc                 C   s:   |du rt  }t| |d\}}|r||vr||vr|S dS )z;Extract domain name information using top-level domain infoNr   )setr)   )r   r*   r   r    r$   r'   r'   r(   extract_domain1   s   r-   c                 C   s:   t | trtt| }|S t | tr| }|S tdt| )z3Parse a string or use urllib.parse object directly.zwrong input type:)r   r   r
   r   r   	TypeErrortype)r   
parsed_urlr'   r'   r(   _parseA   s   

r1   c                 C   s(   t | }|jr|jd }nd}||j S )ziStrip URL of some of its parts to get base URL.
    Accepts strings and urllib.parse ParseResult objects.z://r   )r1   schemenetloc)r   r0   r2   r'   r'   r(   get_base_urlL   s
   
r4   c                 C   sR   t | }t|}tdd|j|j|jg}|dkrd}|r|s%td|  ||fS )zvDecompose URL in two parts: protocol + host/domain and path.
    Accepts strings and urllib.parse ParseResult objects.r   /zincomplete URL: )r1   r4   r   pathqueryfragment
ValueError)r   r0   hostnamepathvalr'   r'   r(   get_host_and_pathW   s   r<   c                 C   s   t | dd}t| }||fS )zXConvenience function returning domain and host info (protocol + host/domain) from a URL.Tr+   )r-   r4   )r   
domainnamebase_urlr'   r'   r(   get_hostinfog   s   r?   baseurlc                 C   sR   | dr|S t| j}t|}|j|dfvr$|jr|S t|jddS t| |S )z8Prepend protocol and host information to relative links.{r   http)r2   )
startswithr
   r3   r2   r   _replacer	   )r@   r   base_netloc	split_urlr'   r'   r(   fix_relative_urlsn   s   


rG   	link_list	urlfilterc                    sD    du r
t t| S  fdd| D }|sdd | D }t t|S )zDReturn a list of links corresponding to the given substring pattern.Nc                    s   g | ]} |v r|qS r'   r'   .0lrI   r'   r(   
<listcomp>   s    zfilter_urls.<locals>.<listcomp>c                 S   s   g | ]	}t |r|qS r'   )FEED_WHITELIST_REGEXsearchrJ   r'   r'   r(   rN      s    )sortedr,   )rH   rI   filtered_listr'   rM   r(   filter_urls~   s   rS   T	referenceignore_suffixc                 C   s4   t |dd\}}t | dd\}}|r||kS ||kS )zjDetermine if a link leads to another host, takes a reference URL and
    a URL as input, returns a booleanTr+   )r)   )r   rT   rU   stripped_refrefstripped_domainr    r'   r'   r(   is_external   s
   rY   linkknown_linksc                 C   s   | |v rdS | d dkr|  dn| d }||v rdS | drN| dr-d| dd  nd| dd  }|d dkr@| dn|d }||v sL||v rNdS d	S )
zDCompare the link and its possible variants to the existing URL base.Tr   r5   rB   httpsN      F)rstriprC   )rZ   r[   
slash_testprotocol_testr'   r'   r(   is_known_link   s   
(
rb   )F)NF)T)$__doc__rehtmlr   typingr   r   r   r   r   r   urllib.parser	   r
   r   r   tldr   compiler   r   r!   IrO   r   boolr)   r-   r1   r4   r<   r?   rG   rS   rY   rb   r'   r'   r'   r(   <module>   sL     




"