o
    DhJ                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZ d
dlmZ d
dlm Z m!Z!m"Z"m#Z# d
dl$m%Z%m&Z& d
dl'm(Z(m)Z)m*Z*m+Z+ d
dl,m-Z-m.Z.m/Z/m0Z0m1Z1 dgZ2e3e4Z5e3d6ej7 e8dZ9e8dZ:e8dZ;e8dZ<e8dZ=e8dej>Z?h dZ@h dZAh dZBh dZCh dZDddhZEh dZFd d!hZGd"d#hZHh d$ZId%d&d'd(d(d(d)d*ZJd+d,hZKg d-ZLd.eMd/eMfd0d1ZNd2eMd3eeM d/eeM fd4d5ZOd6ed7e%d/e%fd8d9ZPd6ed/e
eMeeM f fd:d;ZQd6ed/e%fd<d=ZR	>d^d6ed?ee d@eSd/eeM fdAdBZTd6ed/eeMeeM eeM f fdCdDZUd6ed/eeM fdEdFZVd6ed/eeM fdGdHZWd_d6edIeeM d/eeM fdJdKZXd6ed/eeM fdLdMZYdNeMd6ed/eeM fdOdPZZd`dRedSe[d/eeM fdTdUZ\d6ed/eeM fdVdWZ]			X	dadYeeeMf dIeeM dZee	 d[e[d3eeeM  d/e%fd\d]Z^dS )bzH
Module bundling all functions needed to scrape metadata from webpages.
    N)deepcopy)unescape)AnyDictListOptionalSetTupleUnion)extract_domainget_base_urlis_valid_urlnormalize_urlvalidate_url)	find_date)XPath)HtmlElementtostring   )prune_unwanted_nodes)extract_jsonextract_json_parse_errornormalize_authorsnormalize_json)Documentset_date_params)HTML_STRIP_TAGSline_processing	load_htmltrim)AUTHOR_DISCARD_XPATHSAUTHOR_XPATHSCATEGORIES_XPATHSTAGS_XPATHSTITLE_XPATHSr   htmldatez$https?://(?:www\.|w[0-9]+\.)?([^/]+)z("(?:\\"|[^"])*")|\su5   ^(.+)?\s+[–•·—|⁄*⋆~‹«<›»>:-]\s+(.+)$z["\']z=/(by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero)/([1-9]\.[0-9])zT(cc|creative commons) (by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero) ?([1-9]\.[0-9])?>   
dc.creator
dc:creatordcsext.authoratc-metaauthordc.creator.autparsely-authordcterms.creatorsailthru.authordcterms.creator.autshareaholic:article_author_namebylauthorauthorscreator	rbauthorscitation_authorarticle:author>   dc.descriptiondc:descriptiondcterms.abstractdcterms.descriptiontwitter:descriptionsailthru.descriptiondescription>
   dc.publisherdc:publisherdcterms.publishersailthru.publisher	copyright	publisher	rbpubnamecitation_journal_titletwitter:sitearticle:publisher>   parsely-tagsdcterms.subjectshareaholic:keywordstagskeywordscitation_keywords>   dc.titledcterms.titleparsely-titletwitter:titlesailthru.titleshareaholic:titletitlerbtitlefb_titleheadlinecitation_title	rbmainurltwitter:url>   twitter:imagetwitter:image:srcimageog:imageog:image:urlog:image:secure_urlr1   r6   rF   zapplication-name>   
http-equivcharsetpropertyrT   r=   sitenamer]   pagetype)zog:titlezog:descriptionzog:site_namer^   r_   r`   zog:typez	og:authorzog:article:author)z.//head//link[@rel="canonical"]z.//head//basez6.//head//link[@rel="alternate"][@hreflang="x-default"]rK   returnc                 C   s6   t t| }|s
dS td|} dtd| dS )z!Remove special characters of tags z, N)r   r   CLEAN_META_TAGSsubjoinfiltersplit)rK   trimmed rn   U/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/metadata.pynormalize_tags   s
   rp   r2   author_blacklistc                    s>   dd  D   fdd|  dD }|rd|dS dS )z:Check if the authors string correspond to expected values.c                 S   s   h | ]}|  qS rn   )lower).0arn   rn   ro   	<setcomp>       z check_authors.<locals>.<setcomp>c                    s$   g | ]}|    vr|  qS rn   )striprr   )rs   r1   rq   rn   ro   
<listcomp>   s
    z!check_authors.<locals>.<listcomp>;z; N)rl   rj   rw   )r2   rq   new_authorsrn   rx   ro   check_authors   s   
r|   treemetadatac              	   C   sd   |  dD ]*}|jsqttd|j}zt|}t||}W q tjy/   t	||}Y qw |S )z,Parse and extract metadata from JSON-LD datazK.//script[@type="application/ld+json" or @type="application/settings+json"]z\1)
xpathtextr   JSON_MINIFYri   jsonloadsr   JSONDecodeErrorr   )r}   r~   elemelement_textschemarn   rn   ro   extract_meta_json   s   
r   c                 C   s   t d}| dD ]6}|d|d}}|r@| s@|tv r(||t| < q
|dkr5t|r5||d< q
|tv r@td||d< q
|S )	zESearch meta tags following the OpenGraph guidelines (https://ogp.me/))rT   r1   urlr=   rd   r]   re   z+.//head/meta[starts-with(@property, "og:")]rc   contentzog:urlr   Nr1   )	dictfromkeysr   getisspaceOG_PROPERTIESr   	OG_AUTHORr   )r}   resultr   property_namer   rn   rn   ro   extract_opengraph   s   
r   c              	      s\  t  t| }t|j|j|j|j|j|j	fr|S g d}}| 
dD ] td dd }|s5q$d jv rv dd }|drHq$|dkrT|t| q$|tv r`t|j||_q$|dkrk|jph||_q$|tv ru|j	ps||_	q$d	 jv rֈ d	d }|tv rt|j||_q$|tv r|jp||_q$|tv r|jp||_q$|tv r|jp||_q$|tv sd
|v r|}q$|dkr|jst|r||_q$|tv r|t| q$d jv r dd }|dkrt|j||_q$|dkr|jp||_q$|dkr|jp||_q$t fddtD r!t dt! ddd  q$|jp'||_||_"|S )z)Search meta tags for relevant informationNz.//head/meta[@content]rg   r   rc   og:zarticle:tagrG   nameztwitter:app:namerZ   itempropr1   r=   rW   c                 3   s    | ]}| j vV  qd S N)attrib)rs   keyr   rn   ro   	<genexpr>0  s    zexamine_meta.<locals>.<genexpr>zunknown attribute: %sFunicode)pretty_printencoding)#r   	from_dictr   allrT   r1   r   r=   rd   r]   iterfindr   ri   r   rw   r   rr   
startswithappendrp   PROPERTY_AUTHORr   METANAME_IMAGEMETANAME_AUTHORMETANAME_TITLEMETANAME_DESCRIPTIONMETANAME_PUBLISHERTWITTER_ATTRSr   METANAME_TAG
EXTRA_METALOGGERdebugr   rK   )r}   r~   rK   backup_sitenamecontent_attrproperty_attr	name_attritemprop_attrrn   r   ro   examine_meta   s   





r      expressions	len_limitc                 C   s|   |D ]9}|| }|D ]!}t d| }|r+dt|  k r#|k r+n q
|    S q
t|dkr;td|t| qdS )zExtract meta information    r   z#more than one invalid result: %s %sN)r   rj   itertextlenr   r   )r}   r   r   
expressionresultsr   r   rn   rn   ro   extract_metainfo>  s    
r   c                 C   sV   d}|  d}|dur!t| }t| }r!||d |d fS td |ddfS )z2Extract text segments out of main <title> element.rg   z.//head//titleNr   r   zno main title found)findr   text_contentHTMLTITLE_REGEXmatchr   r   )r}   rT   title_elementr   rn   rn   ro   examine_title_elementQ  s   


r   c                 C   s   |  d}t|dkrt|d  }|r|S t| tpd}|r"|S t| \}}}||fD ]}|r9d|vr9|  S q-|rB|d  S z| dd  }W |S  ty]   t	
d Y |S w )zExtract the document titlez.//h1r   r   rg   .z.//h2zno h2 title found)findallr   r   r   r   r$   r   r   
IndexErrorr   r   )r}   
h1_resultsrT   firstsecondtrn   rn   ro   extract_title_  s,   
r   c                 C   s.   t t| t}t|tdd}|rtd|}|S )zExtract the document author(s)x   )r   N)r   r   r    r   r!   r   )r}   subtreer1   rn   rn   ro   extract_author{  s
   
r   default_urlc           	      C   s   t D ]}| |}|dur|jdnd}|r nq|rN|drN| dD ]'}|dp3|dp3d}|ds>|d	rMt|jd
 }|rM|| } nq&|r^t|\}}|r\t|nd}|pa|S )z'Extract the URL from the canonical linkNhref/z.//head//meta[@content]r   rc   rg   r   ztwitter:r   )	URL_SELECTORSr   r   r   r   r   r   r   r   )	r}   r   selectorelementr   attrtypebase_urlvalidation_result
parsed_urlrn   rn   ro   extract_url  s&   
r   c                 C   s    t | ^}}tdd |D dS )z=Extract the name of a site from the main title (if it exists)c                 s   s     | ]}|rd |v r|V  qdS )r   Nrn   )rs   partrn   rn   ro   r     s    z#extract_sitename.<locals>.<genexpr>N)r   next)r}   _partsrn   rn   ro   extract_sitename  s   r   metatypec                    s   g }d|  d  | dkrt nt}|D ]}| fdd||D  |r& nq| dkr=|s=|dD ]
}||jd  q2dd	 td
d |D D S )z!Find category and tag informationr   z	[s|ies]?/categoryc                 3   s*    | ]}t  |jd  r| V  qdS )r   N)researchr   r   )rs   r   regexprrn   ro   r     s    
z#extract_catstags.<locals>.<genexpr>zR.//head//meta[@property="article:section" or contains(@name, "subject")][@content]r   c                 S   s   g | ]}|r|qS rn   rn   )rs   rrn   rn   ro   ry     rv   z$extract_catstags.<locals>.<listcomp>c                 s   s    | ]	}|rt |V  qd S r   )r   )rs   xrn   rn   ro   r     s    )r"   r#   extendr   r   r   r   r   )r   r}   r   xpath_expressioncatexprr   rn   r   ro   extract_catstags  s    r   Fr   strictc                 C   sf   t | dd}|rd|d   d|d  S | jr1|r,t| j}|r*|d S dS t| jS dS )	zkProbe a link for identifiable free license cues.
    Parse the href attribute first and then the link text.r   rg   zCC r   r   r   r   N)LICENSE_REGEXr   r   upperr   TEXT_LICENSE_REGEXr   )r   r   r   rn   rn   ro   parse_license_element  s   
r   c                 C   s\   |  dD ]}t|dd}|dur|  S q| dD ]}t|dd}|dur+|  S qdS )z:Search the HTML code for license information and parse it.z.//a[@rel="license"][@href]F)r   Nz[.//footer//a[@href]|.//div[contains(@class, "footer") or contains(@id, "footer")]//a[@href]T)r   r   r   )r}   r   r   rn   rn   ro   extract_license  s   r   Tfilecontentdate_config	extensivec           	   
   C   s  |pt  }|p
t|}t| }|du rt S t|}|jr%d|jvr%d|_zt||}W n tyD } zt	d| W Y d}~nd}~ww |j
sMt||_
|jrY|rYt|j||_|jsat||_|jrm|rmt|j||_|jsvt|||_|jrt|jdd|_|j|d< t|fi ||_|jst||_|jrt|jtr|jd |_nt|jtrt|j|_|jd|_|jrd	|jvr|jd  s|j
 |_n|jrt|j}|r|d
 |_|jstd||_|j std||_ t!||_"|d |_#|$  |S )a  Main process for metadata extraction.

    Args:
        filecontent: HTML code as string or parsed tree.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.

    Returns:
        A trafilatura.settings.Document containing the extracted metadata information or None.
        The Document class has .as_dict() method that will return a copy as a dict.
    Nr   z%error in JSON metadata extraction: %sT)fastr   r   @r   r   r   tagmax_date)%setr   r   r   r   r1   r   	Exceptionr   warningrT   r   r|   r   r   r   r   hostnamer   daterd   r   
isinstancelistr   strlstripisupperMETA_URLr   
categoriesr   rK   r   licensefiledateclean_and_trim)	r   r   r   r   rq   r}   r~   errmymatchrn   rn   ro   extract_metadata  sn   










r
  )r   r   )F)NNTN)___doc__r   loggingr   copyr   htmlr   typingr   r   r   r   r   r	   r
   courlanr   r   r   r   r   r%   r   
lxml.etreer   	lxml.htmlr   r   htmlprocessingr   json_metadatar   r   r   r   settingsr   r   utilsr   r   r   r   xpathsr    r!   r"   r#   r$   __all__	getLogger__name__r   setLevelWARNINGcompiler  r   r   rh   r   Ir   r   r   r   r   r   METANAME_URLr   r   r   r   r   r   r   r   rp   r|   r   r   r   intr   r   r   r   r   r   r   boolr   r   r
  rn   rn   rn   ro   <module>   s    $



	
	b

 


