o
    DŒhq&  ã                   @   s˜  d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ e e¡Ze d¡Z e dej!¡Z"e d¡Z#e d¡Z$e d¡Z%e d¡Z&e d¡Z'e d¡Z(e d¡Z)g d¢Z*G dd„ dƒZ+dddefde,dee, de-de.de/de	e, fdd „Z0de,d!ee, de-fd"d#„Z1d$e,de	e, fd%d&„Z2d'ee, d$e,de	e, fd(d)„Z3dS )*z#
Deriving link info from sitemaps.
é    N)Úislice)Úsleep)ÚCallableÚListÚSetÚOptionalÚPattern)Ú	clean_urlÚextract_domainÚfilter_urlsÚfix_relative_urlsÚget_hostinfoÚlang_filteré   )Úis_similar_domain)Ú	fetch_urlÚis_live_page)Ú	MAX_LINKSÚMAX_SITEMAPS_SEENz.<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>z<xhtml:link.+?>zhref=["\'](.+?)["\']zg(?:blogger|blogpost|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\.z^.{0,5}<\?xml|<sitemap|<urlsetz\.xml(\..{2,4})?$|\.xml[?#]zhttps?://[^\s<"]+z
\?.*$|#.*$z\.xml\b)zsitemap.xmlzsitemap.xml.gzÚsitemapzsitemap_index.xmlzsitemap_news.xmlc                   @   sª   e Zd ZdZg d¢Z		ddededee dee d	ed
dfdd„Z	ddd„Z
ded
dfdd„Zdee dedeegdf d
dfdd„Zddd„Zddd„Zddd„ZdS )ÚSitemapObjectzCStore all necessary information on sitemap download and processing.)	Úbase_urlÚcontentÚcurrent_urlÚdomainÚexternalÚseenÚsitemap_urlsÚtarget_langÚurlsNFr   r   Úsitemapsurlsr   r   Úreturnc                 C   s<   || _ d| _|| _|| _d| _tƒ | _|| _|| _g | _	d S )NÚ )
r   r   r   r   r   Úsetr   r   r   r   )Úselfr   r   r    r   r   © r%   úU/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/sitemaps.pyÚ__init__@   s   
zSitemapObject.__init__c                 C   s0   t  d| j¡ t| jƒpd| _| j | j¡ dS )z!Fetch a sitemap over the network.zfetching sitemap: %sr"   N)ÚLOGGERÚdebugr   r   r   r   Úadd©r$   r%   r%   r&   ÚfetchR   s   zSitemapObject.fetchÚlinkc                 C   s¼   || j krdS t| j|ƒ}t|| jƒpd}|rt|| jƒsdS t|dd}|du r1t d|¡ dS | j	sIt
 |¡sIt| j|ƒsIt d| j|¡ dS t |¡rV| j |¡ dS | j |¡ dS )z^Examine a link and determine if it's valid and if it leads to
        a sitemap or a web page.Nr"   T)Úfastzcouldn't extract domain: %sz-link discarded, diverging domain names: %s %s)r   r   r   r	   r   r   r
   r(   Úerrorr   ÚWHITELISTED_PLATFORMSÚsearchr   r   ÚwarningÚDETECT_SITEMAP_LINKr   Úappendr   )r$   r-   Ú	newdomainr%   r%   r&   Úhandle_linkX   s.   
ÿþ
ýÿ
zSitemapObject.handle_linkÚregexÚindexÚhandlerc                    sP   ‡ fdd„t | | j¡tƒD ƒD ]}||ƒ qt dt| jƒt| jƒ| j	¡ dS )zJExtract links from the content using pre-defined regex, index and handler.c                 3   s    | ]}|ˆ  V  qd S )Nr%   )Ú.0Úm©r8   r%   r&   Ú	<genexpr>~   s   € 
ÿz.SitemapObject.extract_links.<locals>.<genexpr>z%%s sitemaps and %s links found for %sN)
r   Úfinditerr   r   r(   r)   Úlenr   r   r   )r$   r7   r8   r9   Úmatchr%   r<   r&   Úextract_linksz   s   

ÿ
üzSitemapObject.extract_linksc                    sP   dˆj vrdS t dˆj› dtj¡‰ dtddf‡ ‡fdd„}ˆ td	|¡ dS )
z7Extract links corresponding to a given target language.z	hreflang=Nzhreflang=[\"'](z.*?|x-default)[\"']Úattrsr!   c                    s2   ˆ   | ¡rt  | ¡}|rˆ |d ¡ dS dS dS )z!Examine language code attributes.r   N)r1   ÚHREFLANG_REGEXr6   )rB   Ú
lang_match©Ú
lang_regexr$   r%   r&   Úhandle_lang_link’   s   

ýzASitemapObject.extract_sitemap_langlinks.<locals>.handle_lang_linkr   )r   ÚreÚcompiler   ÚDOTALLÚstrrA   ÚXHTML_REGEX)r$   rG   r%   rE   r&   Úextract_sitemap_langlinks‰   s   
ÿz'SitemapObject.extract_sitemap_langlinksc                 C   s   |   td| j¡ dS )z=Extract sitemap links and web page links from a sitemap file.r   N)rA   Ú
LINK_REGEXr6   r+   r%   r%   r&   Úextract_sitemap_links›   s   ÿz#SitemapObject.extract_sitemap_linksc                 C   sd   t | j| jƒ}|sdS t | j¡s|  td| j¡ dS | jdur,|  	¡  | j
s*| jr,dS |  ¡  dS )z5Download a sitemap and extract the links it contains.Nr   )Úis_plausible_sitemapr   r   ÚSITEMAP_FORMATr@   rA   ÚDETECT_LINKSr6   r   rM   r   r   rO   )r$   Ú	plausibler%   r%   r&   Úprocess¡   s   
zSitemapObject.process)NF)r!   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú	__slots__rK   r   r   Úboolr'   r,   r6   r   Úintr   rA   rM   rO   rT   r%   r%   r%   r&   r   2   s@    úþýüûú
ù
"ÿÿÿ
þ

r   Fg       @Úurlr   r   Ú
sleep_timeÚmax_sitemapsr!   c                    s<  t | ƒ\}‰ |du rt d| ¡ g S tˆ ƒst d| ¡ g S d}|  d¡r)| g}ng }t| ƒtˆ ƒd kr7| }tˆ ||||ƒ‰ˆjsPtˆ ƒpN‡ fdd„t	D ƒˆ_ˆjrˆtˆj
ƒ|k rˆˆj ¡ ˆ_ˆ ¡  ˆ ¡  ‡fdd„ˆjD ƒˆ_tˆj
ƒ|k r~t|ƒ ˆjrˆtˆj
ƒ|k sZ|r‘tˆj|ƒˆ_t d	tˆjƒ|¡ ˆjS )
ax  Look for sitemaps for the given URL and gather links.

    Args:
        url: Webpage or sitemap URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).
        external: Similar hosts only or external URLs
                  (boolean, defaults to False).
        sleep_time: Wait between requests on the same website.
        max_sitemaps: Maximum number of sitemaps to process.

    Returns:
        The extracted links as a list (sorted list of unique links).

    Nzinvalid URL: %sz*base URL unreachable, dropping sitemap: %s)z.gzr   z.xmlé   c                    s   g | ]	}ˆ › d |› ‘qS )ú/r%   )r:   Úg©Úbaseurlr%   r&   Ú
<listcomp>á   s    ÿz"sitemap_search.<locals>.<listcomp>c                    s   g | ]	}|ˆ j vr|‘qS r%   )r   )r:   Ús)r   r%   r&   rd   ë   s    z%s sitemap links found for %s)r   r(   r2   r   Úendswithr?   r   r   Úfind_robots_sitemapsÚGUESSESr   Úpopr   r,   rT   r   r   r   r)   )r\   r   r   r]   r^   Ú
domainnameÚ	urlfilterÚsitemapurlsr%   )rc   r   r&   Úsitemap_search³   s@   
ÿ
ÿörm   Úcontentsc                 C   s^   |du rdS t  d| ¡} t | ¡rt|tƒr%t |¡r%d|dd…  ¡ v r-t	 
d| ¡ dS dS )zLCheck if the sitemap corresponds to an expected format,
    i.e. TXT or XML.NFr"   z<htmlé–   znot a valid XML sitemap: %sT)ÚSCRUB_REGEXÚsubÚPOTENTIAL_SITEMAPr1   Ú
isinstancerK   rQ   r@   Úlowerr(   r2   )r\   rn   r%   r%   r&   rP   ù   s   ÿþþrP   rc   c                 C   s   t | d ƒ}t|| ƒS )zUGuess the location of the robots.txt file and try to extract
    sitemap URLs from itz/robots.txt)r   Úextract_robots_sitemaps)rc   Ú	robotstxtr%   r%   r&   rg     s   
rg   rv   c                    sÔ   | du s
t | ƒdkrg S g }|  ¡ D ]=}| d¡}|dkr#|d|… }| ¡ }|s*q| dd¡}t |ƒdkrO|d  ¡  ¡ |d< |d dkrO| |d  ¡ ¡ qtt 	|¡ƒ}‡ fd	d
„|D ƒ}t
 dt |ƒ¡ |S )z.Read a robots.txt file and find sitemap links.Ni'  ú#r   ú:r   r_   r   c                    s   g | ]	}|rt ˆ |ƒ‘qS r%   )r   )r:   Úurb   r%   r&   rd   -  s    z+extract_robots_sitemaps.<locals>.<listcomp>z%s sitemaps found in robots.txt)r?   Ú
splitlinesÚfindÚstripÚsplitrt   r4   ÚlistÚdictÚfromkeysr(   r)   )rv   rc   Ú
candidatesÚlineÚiÚ
line_partsrl   r%   rb   r&   ru     s(   
€ru   )4rX   ÚloggingrH   Ú	itertoolsr   Útimer   Útypingr   r   r   r   r   Úcourlanr	   r
   r   r   r   r   Údeduplicationr   Ú	downloadsr   r   Úsettingsr   r   Ú	getLoggerrU   r(   rI   rN   rJ   rL   rC   r0   rQ   r3   rR   rp   rr   rh   r   rK   rZ   Úfloatr[   rm   rP   rg   ru   r%   r%   r%   r&   Ú<module>   sZ     	


ÿ




	 ûÿþýüû
úF"