o
    Dh%                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ eeZh dZedZedZ edZ!edZ"edZ#edZ$G dd dZ%de&de'fddZ(de	e& de%de	e& fddZ)de&de%de	e& fddZ*de&de%de	e& fdd Z+d!e&de%de	e& fd"d#Z,de%d$e
e& de	e& fd%d&Z-		'	(d2d)e&d*e
e& d+e'd,e.de	e& f
d-d.Z/d/e&d*e
e& de	e& fd0d1Z0dS )3z>
Examining feeds and extracting links for further processing.
    N)islice)sleep)ListOptional)	check_url	clean_urlfilter_urlsfix_relative_urlsget_hostinfois_valid_url   )is_similar_domain)	fetch_url)	MAX_LINKS)	load_html>   text/rdftext/rsstext/xml	text/atom
text/plaintext/rdf+xmltext/rss+xmltext/atom+xmlapplication/rdfapplication/rssapplication/xmlapplication/atomapplication/jsonapplication/rdf+xmlapplication/rss+xmlapplication/atom+xmlapplication/feed+jsonapplication/x-atom+xmlapplication/x.atom+xmlz<(feed|rss|\?xml)z<link .*?href=".+?"zhref="(.+?)"z:<link>(?:\s*)(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?(?:\s*)</link>z\bcomments\bzn\.(?:atom|rdf|rss|xml)$|\b(?:atom|rss)\b|\?type=100$|feeds/posts/default/?$|\?feed=(?:atom|rdf|rss|rss2)|feed$c                   @   sD   e Zd ZdZg dZ		ddedededed	ee d
dfddZdS )FeedParametersz.Store necessary information to proceed a feed.basedomainextlangrefFNbaseurlr'   	referenceexternaltarget_langreturnc                 C   s"   || _ || _|| _|| _|| _d S Nr%   )selfr+   r'   r,   r-   r.    r2   R/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/feeds.py__init__M   s
   
zFeedParameters.__init__)FN)	__name__
__module____qualname____doc__	__slots__strboolr   r4   r2   r2   r2   r3   r$   I   s$    r$   feed_stringr/   c                 C   s*   t | rdS | dd }d|v pd|v S )z$Check if the string could be a feed.TNd   z<rssz<feed)FEED_OPENINGmatch)r<   	beginningr2   r2   r3   is_potential_feed\   s   
rA   linklistparamsc                 C   s   g }t t| D ]B}t|j|}t||jd}|dur=|js5d|vr5t|j|d s5t	
d|j|d  q||d  qd|v sEd|v rJ|| q|S )	zGExamine links to determine if they are valid and
    lead to a web page)languageNfeedr   z'Rejected, diverging domain names: %s %sr   
feedburner	feedproxy)sortedsetr	   r&   r   r)   r(   r   r'   LOGGERwarningappend)rB   rC   output_linksitemlinkcheckedr2   r2   r3   handle_link_listd   s$   
rQ   c                 C   s   t | s<| dr3zdd t| dg D }dd |D W S  tjjy2   td|j	 Y g S w td|j	 g S d| v rRd	d d
d t
t| tD D S d| v redd t
t| tjtD S g S )z<Try different feed types and return the corresponding links.{c                 S   s    g | ]}| d p| dqS )urlid)get).0rN   r2   r2   r3   
<listcomp>   s    zfind_links.<locals>.<listcomp>itemsc                 S   s   g | ]}|d ur|qS r0   r2   )rV   cr2   r2   r3   rW      s    zJSON decoding error: %szPossibly invalid feed: %sz<link c                 S   s*   g | ]}d |vrd|vrt |d qS )zatom+xmlz
rel="self"r   )	LINK_HREFsearchrV   rO   r2   r2   r3   rW      s
    c                 s   s    | ]}|d  V  qdS )r   Nr2   rV   mr2   r2   r3   	<genexpr>   s    
zfind_links.<locals>.<genexpr>z<link>c                 S   s   g | ]}|d    qS )r   )stripr]   r2   r2   r3   rW      s    
)rA   
startswithjsonloadsrU   decoderJSONDecodeErrorrJ   debugr'   r   
LINK_ATTRSfinditerr   LINK_ELEMENTSreDOTALL)r<   rC   
candidatesr2   r2   r3   
find_links   s0   
rm   c                    sl   | st d j g S t|   } fddt| D }|r-t dt|t| |S t d j |S )z7Extract and refine links from Atom, RSS and JSON feeds.zEmpty feed: %sc                    s(   g | ]}| j kr|d dkr|qS )/   )r*   countr\   rC   r2   r3   rW      s
    z!extract_links.<locals>.<listcomp>z!Links found: %s of which %s validzInvalid feed for %s)rJ   rf   r'   rm   r`   rQ   len)r<   rC   
feed_linksrM   r2   rq   r3   extract_links   s   
rt   
htmlstringc                 C   s   t | }|du rtd|j g S dd |dD }|s'dd |dD }g }t|D ]!}t|j|}t|}|rO||j	krOt
|rOt|sO|| q.tdt|t| |S )	zxParse the HTML and try to extract feed URLs from the home page.
    Adapted from http://www.aaronsw.com/2002/feedfinder/NzInvalid HTML/Feed page: %sc                 S   s8   g | ]}| d tv st| ddr| ddqS )typehref )rU   
FEED_TYPESLINK_VALIDATION_REr[   r\   r2   r2   r3   rW      s    
z"determine_feed.<locals>.<listcomp>z//link[@rel="alternate"][@href]c                 S   s*   g | ]}t |d dr|d dqS )rw   rx   )rz   r[   rU   r\   r2   r2   r3   rW      s    
z
//a[@href]z%Feed URLs found: %s of which %s valid)r   rJ   rf   r&   xpathdictfromkeysr	   r   r*   r   	BLACKLISTr[   rL   rr   )ru   rC   tree	feed_urlsoutput_urlsrO   r2   r2   r3   determine_feed   s8   

r   	urlfilterc                 C   sT   | j r(td| j d| j  d}|r(t|| }t||}tdt|| j |S g S )z2Alternative way to gather feed links: Google News.z*https://news.google.com/rss/search?q=site:z&hl=z&scoring=n&num=100z!%s Google news links found for %s)r)   r   r'   rt   r   rJ   rf   rr   )rC   r   
downloadedrs   r2   r2   r3   probe_gnews   s   

r   F       @rS   r.   r-   
sleep_timec                 C   s  t | \}}|du rtd|  g S t||| ||}d}t| }|durft||}	|	sMt||D ]}
t|
}|r@|	t|| q0t| t|d krM| }|	r_t	|	|}	t
dt|	| |	S t
d|  ntd|  | d|kr|t| t||S t||S )a  Try to find feed URLs.

    Args:
        url: Webpage or feed URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).
        external: Similar hosts only or external URLs
                  (boolean, defaults to False).
        sleep_time: Wait between requests on the same website.

    Returns:
        The extracted links as a list (sorted list of unique links).

    NzInvalid URL: %sro   z%s feed links found for %szNo usable feed links found: %szCould not download web page: %srn   )r
   rJ   rK   r$   r   rt   r   extendrr   r   rf   errorr`   r   try_homepager   )rS   r.   r-   r   r'   r+   rC   r   r   rs   rE   r<   r2   r2   r3   find_feed_urls   s6   



r   r+   c                 C   s   t d|  t| |S )zhShift into reverse and try the homepage instead of the particular feed
    page that was given as input.z&Probing homepage for feeds instead: %s)rJ   rf   r   )r+   r.   r2   r2   r3   r   4  s   
r   )NFr   )1r8   rb   loggingrj   	itertoolsr   timer   typingr   r   courlanr   r   r   r	   r
   r   deduplicationr   	downloadsr   settingsr   utilsr   	getLoggerr5   rJ   ry   compiler>   rg   rZ   ri   r~   rz   r$   r:   r;   rA   rQ   rm   rt   r   r   floatr   r   r2   r2   r2   r3   <module>   s\     	





(,
"9