"""Functions dedicated to website navigation and crawling/spidering."""

import logging

from configparser import ConfigParser
from time import sleep
from typing import List, Optional, Tuple
from urllib.robotparser import RobotFileParser

from courlan import (
    UrlStore,
    extract_links,
    fix_relative_urls,
    get_base_url,
    is_navigation_page,
    is_not_crawlable,
)

# language detection is optional
try:
    import py3langid
except ImportError:
    pass

from lxml.etree import XPath, tostring

from .core import baseline, prune_unwanted_nodes
from .downloads import Response, fetch_response, fetch_url
from .settings import DEFAULT_CONFIG
from .utils import LANGID_FLAG, decode_file, load_html


LOGGER = logging.getLogger(__name__)

URL_STORE = UrlStore(compressed=False, strict=False)

ROBOTS_TXT_URL = "/robots.txt"
MAX_SEEN_URLS = 10
MAX_KNOWN_URLS = 100000


class CrawlParameters:
    "Store necessary information to manage a focused crawl."
    __slots__ = ["start", "base", "lang", "rules", "ref", "i", "known_num", "is_on", "prune_xpath"]

    def __init__(
        self,
        start: str,
        lang: Optional[str] = None,
        rules: Optional[RobotFileParser] = None,
        prune_xpath: Optional[str] = None,
    ) -> None:
        self.start = start
        self.base = self._get_base_url(start)
        self.ref = self._get_reference(start)
        self.lang = lang
        self.rules = rules or get_rules(self.base)
        self.i = 0
        self.known_num = 0
        self.is_on = True
        self.prune_xpath = prune_xpath

    def _get_base_url(self, start: str) -> str:
        "Set reference domain for the crawl."
        base_url = get_base_url(start)
        if not base_url:
            raise ValueError(f"cannot start crawl: {start}")
        return base_url

    def _get_reference(self, start: str) -> str:
        "Determine the reference URL."
        return start.rsplit("/", 1)[0] if start.count("/") >= 3 else start

    def update_metadata(self, url_store: UrlStore) -> None:
        "Adjust crawl data based on URL store info."
        self.is_on = bool(url_store.find_unvisited_urls(self.base))
        self.known_num = len(url_store.find_known_urls(self.base))

    def filter_list(self, todo: Optional[List[str]]) -> List[str]:
        "Prepare the todo list, excluding invalid URLs."
        if not todo:
            return []
        return [u for u in todo if u != self.start and self.ref in u]

    def is_valid_link(self, link: str) -> bool:
        "Run checks: robots.txt rules, URL type and crawl breadth."
        return (
            (not self.rules or self.rules.can_fetch("*", link))
            and self.ref in link
            and not is_not_crawlable(link)
        )


def refresh_detection(htmlstring: str, homepage: str) -> Tuple[Optional[str], Optional[str]]:
    "Check if there could be a redirection by meta-refresh tag."
    if '"refresh"' not in htmlstring and '"REFRESH"' not in htmlstring:
        return htmlstring, homepage

    html_tree = load_html(htmlstring)
    if html_tree is None:
        return htmlstring, homepage

    # test for a meta-refresh redirection
    results = html_tree.xpath('.//meta[@http-equiv="refresh" or @http-equiv="REFRESH"]/@content')
    result = results[0] if results else ""
    if not result or ";" not in result:
        logging.info("no redirect found: %s", homepage)
        return htmlstring, homepage

    # determine the next URL
    url2 = result.split(";")[1].strip().lower().replace("url=", "")
    if not url2.startswith("http"):
        # relative URL, adapt it
        base_url = get_base_url(homepage)
        url2 = fix_relative_urls(base_url, url2)

    # second fetch
    newhtmlstring = fetch_url(url2)
    if newhtmlstring is None:
        logging.warning("failed redirect: %s", url2)
        return None, None
    logging.info("successful redirect: %s", url2)
    return newhtmlstring, url2


def probe_alternative_homepage(homepage: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    "Check if the homepage is redirected and return appropriate values."
    response = fetch_response(homepage, decode=False)
    if not response or not response.data:
        return None, None, None

    # register a redirected URL
    if response.url not in (homepage, "/"):
        logging.info("followed homepage redirect: %s", response.url)
        homepage = response.url

    # decode the response
    htmlstring = decode_file(response.data)

    # is there a meta-refresh on the page?
    new_htmlstring, new_homepage = refresh_detection(htmlstring, homepage)
    if new_homepage is None:  # malformed or malicious content
        return None, None, None

    logging.debug("fetching homepage OK: %s", new_homepage)
    return new_htmlstring, new_homepage, get_base_url(new_homepage)


def parse_robots(robots_url: str, data: str) -> Optional[RobotFileParser]:
    "Parse a robots.txt file with the standard library urllib.robotparser."
    rules = RobotFileParser()
    rules.set_url(robots_url)
    # exceptions can happen here
    try:
        rules.parse(data.splitlines())
    except Exception as exc:
        LOGGER.error("cannot read robots.txt: %s", exc)
        return None
    return rules


def get_rules(base_url: str) -> Optional[RobotFileParser]:
    "Attempt to fetch and parse robots.txt file for a given website."
    robots_url = base_url + ROBOTS_TXT_URL
    data = fetch_url(robots_url)
    return parse_robots(robots_url, data) if data else None


def is_target_language(htmlstring: str, language: Optional[str]) -> bool:
    """Run a baseline extraction and use a language detector to
    check if the content matches the target language.
    Return True if language checks are bypassed."""
    if htmlstring and language and LANGID_FLAG:
        _, text, _ = baseline(htmlstring)
        result, _ = py3langid.classify(text)
        return bool(result == language)
    return True


def is_still_navigation(todo: List[str]) -> bool:
    "Probe if there are still navigation URLs in the queue."
    return any(is_navigation_page(url) for url in todo)


def process_links(htmlstring: str, params: CrawlParameters, url: Optional[str] = "") -> None:
    """Examine the HTML code and process the retrieved internal links.
    Extract and filter new internal links after an optional language check.
    Store the links in todo-list while prioritizing the navigation ones."""
    if not is_target_language(htmlstring, params.lang):
        return

    # remove unwanted page elements before link extraction
    if htmlstring and params.prune_xpath is not None:
        if isinstance(params.prune_xpath, str):
            params.prune_xpath = [params.prune_xpath]
        tree = load_html(htmlstring)
        if tree is not None:
            tree = prune_unwanted_nodes(tree, [XPath(x) for x in params.prune_xpath])
            htmlstring = tostring(tree).decode()

    links, links_priority = [], []
    for link in extract_links(
        pagecontent=htmlstring,
        url=url or params.base,
        external_bool=False,
        language=params.lang,
        with_nav=True,
    ):
        if not params.is_valid_link(link):
            continue
        if is_navigation_page(link):
            links_priority.append(link)
        else:
            links.append(link)

    URL_STORE.add_urls(urls=links, appendleft=links_priority)


def process_response(response: Optional[Response], params: CrawlParameters) -> None:
    "Convert urllib3 response object and extract links."
    if response is None or not response.data:
        return
    # add the final document URL to known_links
    URL_STORE.add_urls([response.url], visited=True)
    # convert the response to a string and proceed to link extraction
    process_links(decode_file(response.data), params, params.base)


def init_crawl(
    start: str,
    lang: Optional[str] = None,
    rules: Optional[RobotFileParser] = None,
    todo: Optional[List[str]] = None,
    known: Optional[List[str]] = None,
    prune_xpath: Optional[str] = None,
) -> CrawlParameters:
    """Initialize crawl by setting variables, copying values to the
    URL store and retrieving the initial page if the crawl starts."""
    params = CrawlParameters(start, lang, rules, prune_xpath)

    # copy existing info to the URL store
    URL_STORE.add_urls(urls=known or [], visited=True)
    URL_STORE.add_urls(urls=params.filter_list(todo), visited=False)
    URL_STORE.store_rules(params.base, params.rules)

    # visit the start page if necessary
    if not todo:
        URL_STORE.add_urls(urls=[params.start], visited=False)
        params = crawl_page(params, initial=True)
    else:
        params.update_metadata(URL_STORE)

    return params


def crawl_page(params: CrawlParameters, initial: bool = False) -> CrawlParameters:
    "Examine a webpage, extract navigation links and links."
    url = URL_STORE.get_url(params.base)
    if not url:
        params.is_on = False
        params.known_num = len(URL_STORE.find_known_urls(params.base))
        return params

    params.i += 1

    if initial:
        # probe and process the homepage
        htmlstring, homepage, new_base_url = probe_alternative_homepage(url)
        if htmlstring and homepage and new_base_url:
            # register a potentially new homepage
            URL_STORE.add_urls([homepage])
            # extract links on the homepage, resolved against the (possibly redirected) URL
            process_links(htmlstring, params, url=homepage)
    else:
        response = fetch_response(url, decode=False)
        process_response(response, params)

    params.update_metadata(URL_STORE)
    return params


def focused_crawler(
    homepage: str,
    max_seen_urls: int = MAX_SEEN_URLS,
    max_known_urls: int = MAX_KNOWN_URLS,
    todo: Optional[List[str]] = None,
    known_links: Optional[List[str]] = None,
    lang: Optional[str] = None,
    config: ConfigParser = DEFAULT_CONFIG,
    rules: Optional[RobotFileParser] = None,
    prune_xpath: Optional[str] = None,
) -> Tuple[List[str], List[str]]:
    """Basic crawler targeting pages of interest within a website.

    Args:
        homepage: URL of the first page to fetch, preferably the homepage of a website.
        max_seen_urls: maximum number of pages to visit, stop iterations at this number or at the exhaustion of pages on the website, whichever comes first.
        max_known_urls: stop if the total number of pages "known" exceeds this number.
        todo: provide a previously generated list of pages to visit / crawl frontier.
        known_links: provide a list of previously known pages.
        lang: try to target links according to language heuristics.
        config: use a different configuration (configparser format).
        rules: provide politeness rules (urllib.robotparser.RobotFileParser() format).
        prune_xpath: remove unwanted elements from the HTML pages using XPath.

    Returns:
        List of pages to visit, deque format, possibly empty if there are no further pages to visit.
        Set of known links.

    """
    params = init_crawl(homepage, lang, rules, todo, known_links, prune_xpath)

    sleep_time = URL_STORE.get_crawl_delay(
        params.base, default=config.getfloat("DEFAULT", "SLEEP_TIME")
    )

    # visit pages until a limit is reached
    while params.is_on and params.i < max_seen_urls and params.known_num <= max_known_urls:
        params = crawl_page(params)
        sleep(sleep_time)

    # refocus the todo-list on URLs without navigation
    todo = list(dict.fromkeys(URL_STORE.find_unvisited_urls(params.base)))
    known_links = list(dict.fromkeys(URL_STORE.find_known_urls(params.base)))
    return todo, known_links
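

# Minimal usage sketch of focused_crawler() and is_still_navigation(): a first pass
# visits up to max_seen_urls pages, and a second call resumes the crawl from the
# returned frontier. Assumptions: this module is importable (e.g. as trafilatura.spider)
# and "https://www.example.org" is only a placeholder for a real, reachable website.
if __name__ == "__main__":
    # first pass: crawl from the homepage up to 10 visited pages
    to_visit, known = focused_crawler("https://www.example.org", max_seen_urls=10)
    print(f"{len(to_visit)} pages left to visit, {len(known)} links known")

    # optional second pass: resume the crawl from the stored frontier
    if is_still_navigation(to_visit):
        to_visit, known = focused_crawler(
            "https://www.example.org",
            max_seen_urls=10,
            todo=to_visit,
            known_links=known,
        )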