o
    Dh2                  	   @   sl  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZmZ h dZh d	Zh d
ZedejZedejZedZedejZedejZedejZejdejdZedZejdejdZedZ dZ!edejZ"edejZ#de"fde#fgZ$ejdejdZ%edZ&edZ'edZ(edZ)ed Z*ejd!ejdZ+ed"Z,ejd#ejdZ-ejd$ej.dZ/d?d%ed&ed'e	e0 d(e1fd)d*Z2d+ed%ed(efd,d-Z3d.eee ee0e0f f d%ed(efd/d0Z4d1e0d2e
e0 d(e	e0 fd3d4Z5d5e0d%ed(efd6d7Z6d8e0d(e0fd9d:Z7d;e	e0 d<e0d(e	e0 fd=d>Z8dS )@z
Functions needed to scrape metadata from JSON-LD format.
For reference, here is the list of all JSON-LD types: https://schema.org/docs/full.html
    N)unescape)AnyDictListOptionalPatternUnion   )Document)HTML_STRIP_TAGStrim>
   articleblogpostingnewsarticleliveblogpostingscholarlyarticleopinionnewsarticlesocialmediapostingreportagenewsarticlebackgroundnewsarticlemedicalscholarlyarticle>!   blogqapagereportr   faqpagewebpagewebsiteitempage	aboutpage
jobpostingr   contactpager   profilepagetecharticlecheckoutpagecollectionpagemedicalwebpager   satiricalarticler   realestatelistingreviewnewsarticlesearchresultspager   r   analysisnewsarticleaskpublicnewsarticler   r   discussionforumpostingr   advertisercontentarticle>   r   r   organizationnewsmediaorganizationzM"author":[^}[]+?"name?\\?": ?\\?"([^"\\]+)|"author"[^}[]+?"names?".+?"([^"]+)z$"[Pp]erson"[^}]+?"names?".+?"([^"]+)z`,?(?:"\w+":?[:|,\[])?{?"@type":"(?:[Ii]mageObject|[Oo]rganization|[Ww]eb[Pp]age)",[^}[]+}[\]|}]?z,"publisher":[^}]+?"name?\\?": ?\\?"([^"\\]+)z"@type"\s*:\s*"([^"]*)"z"articleSection": ?"([^"\\]+)z"author":|"person":)flagsz<[^>]+>z^https?://schema\.orgz\\u([0-9a-fA-F]{4}))	givenNameadditionalName
familyNamez*"@type":"[Aa]rticle", ?"name": ?"([^"\\]+)z"headline": ?"([^"\\]+)z"name"z
"headline"uB   ^([a-zäöüß]+(ed|t))? ?(written by|words by|words|by|von|from) z\d.+?$z@[\w]+z[._+]u$   ["‘({\[’\'][^"]+?[‘’"\')\]}]u   [^\w]+$|[:()?*$#!%/<>{}~¿]u;   \b\s+(am|on|for|at|in|to|from|of|via|with|—|-|–)\s+(.*)z3\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\bz"/|;|,|\||&|(?:^|\W)[u|a]nd(?:$|\W)u>   [✀-➾😀-🙏☀-⛿🌀-🗿🤀-🧿🩰-🫿🚀-🛿]+metadata	candidatecontent_typereturnc                 C   sV   |r)t |tr)| jrt| jt|k r|dkrdS | jr)| jdr)|ds)dS dS )z6Determine if the candidate should be used as sitename.r   ThttpF)
isinstancestrsitenamelen
startswith)r4   r5   r6    r>   Z/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/json_metadata.pyis_plausible_sitename9   s    r@   parentc              	      s  t d| D ]@}d|v rd|d v r|d d |_d|vs!|d s"qt|d tr/|d d n|d }| }|tv rC|jsCt||_|tv r`|	dpU|	dpU|	d}t
|||r_||_q|dkrz|	dry|d d	syt|j|d |_q|tv rFd
|v r|d
 }t|trzt|}W n tjy   t|j||_Y nw t|ts|g}|D ]U d vs d dkrd}d v r 	d}t|trd|d}n"t|trd|v r|d }nd v rd v rd fddtD }t|trt|j||_q|js(d|v r(t|d tr|d g|_n
tt d|d |_|jsFd|v r<|dkr<|d |_qd|v rF|d |_q|S )z3Find and extract selected metadata from JSON parts.N	publishername@typer   	legalNamealternateNamepersonr8   authorPerson; r1   r3    c                 3   s     | ]}| v r | V  qd S Nr>   ).0xrH   r>   r?   	<genexpr>x   s    z!process_parent.<locals>.<genexpr>articleSectionr   headline)filterr;   r9   listlowerJSON_OGTYPE_SCHEMApagetypenormalize_jsonJSON_PUBLISHER_SCHEMAgetr@   r=   normalize_authorsrH   JSON_ARTICLE_SCHEMAr:   jsonloadsJSONDecodeErrorjoinstripdictAUTHOR_ATTRS
categoriestitle)rA   r4   contentr6   r5   list_authorsauthor_namer>   rO   r?   process_parentC   sn   "








ri   schemac                 C   s   t | tr| g} | D ]W}|d}|rat |trat|rad|v r2t |d tr,|d n|d g}n*d|v rZt |d trZd|d  v rZd|v rZt |d trT|d n|d g}n| }t||}q
|S )z,Parse and extract metadata from JSON-LD dataz@contextz@graphrD   r   liveBlogUpdate)	r9   rb   rZ   r:   JSON_SCHEMA_ORGmatchrT   rU   ri   )rj   r4   rA   contextr>   r>   r?   extract_json   s   

"."
ro   elemtextregular_expressionc                 C   s^   d}| | }|r+d|d v r+t||d }|jd| dd} | | }|r+d|d v s|p.dS )z.Crudely extract author names from JSON-LD dataNrK   r	    )count)searchr[   sub)rp   rq   authorsmymatchr>   r>   r?   extract_json_author   s   

rx   elemc                 C   s  t d| }t|tpt|t}|r||_d| v r/t| }|r/t|d 	 }|t
v r/||_d| v rNt| }|rNd|d vrNt|d }t||rN||_d| v rat| }|rat|d g|_tD ]\}}|| v r|js|| }|rt|d |_ |S qc|S )z*Crudely extract metadata from JSON-LD datarr   rD   r	   z"publisher",z"articleSection")JSON_AUTHOR_REMOVEru   rx   JSON_AUTHOR_1JSON_AUTHOR_2rH   	JSON_TYPErt   rX   rU   rV   rW   JSON_PUBLISHERr@   r;   JSON_CATEGORYrd   JSON_SEQre   )ry   r4   element_text_authorrH   rw   r5   keyregexr>   r>   r?   extract_json_parse_error   s>   





 r   stringc                 C   s`   d| v r(|  dd dd dd} tdd | } ddd	 | D } t| } ttd| S )
z-Normalize unicode strings and trim the output\z\nrr   z\rz\tc                 S   s   t t| d dS )Nr	      )chrint)rm   r>   r>   r?   <lambda>   s    z normalize_json.<locals>.<lambda>c                 s   s,    | ]}t |d k st |dkr|V  qdS )i   i  N)ordrM   cr>   r>   r?   rP      s   * z!normalize_json.<locals>.<genexpr>)replaceJSON_UNICODE_REPLACEru   r`   r   r   JSON_REMOVE_HTML)r   r>   r>   r?   rX      s   rX   current_authorsauthor_stringc                    s  g }|  dst|r| S | dur| d}d|v r$| d}d|v s,d|v r0t|}t	d|}t
|D ]x t  t	d  t	d  tt	d	  t	d  t	d  t	d  t	d  t	d   rt d
krd	 vrd vrq; d  rtdd  D dk r    |vrt|dkst fdd|D r|  q;t|dkr| S d|dS )z3Normalize author info to focus on author names onlyr8   NrJ   z\uunicode_escapez&#z&amp;rr   rK   2   -r   c                 s   s    | ]	}|  rd V  qdS )r	   N)isupperr   r>   r>   r?   rP     s    z$normalize_authors.<locals>.<genexpr>r	   c                 3   s    | ]}| vV  qd S rL   r>   )rM   
new_authorrO   r>   r?   rP     s    )rU   r=   AUTHOR_EMAILrm   splitencodedecoder   r   ru   AUTHOR_SPLITr   AUTHOR_EMOJI_REMOVEAUTHOR_TWITTERAUTHOR_REPLACE_JOINAUTHOR_REMOVE_NICKNAMEAUTHOR_REMOVE_SPECIALAUTHOR_PREFIXAUTHOR_REMOVE_NUMBERSAUTHOR_REMOVE_PREPOSITIONr<   r   sumre   allappendr`   ra   )r   r   new_authorsr>   rO   r?   r[      s<   
 "*
r[   rL   )9__doc__r]   rehtmlr   typingr   r   r   r   r   r   settingsr
   utilsr   r   r\   rV   rY   compileDOTALLr|   r}   r{   r   r~   r   
IGNORECASE
JSON_MATCHr   rl   r   rc   	JSON_NAMEJSON_HEADLINEr   r   r   r   r   r   r   r   r   r   UNICODEr   r:   boolr@   ri   ro   rx   r   rX   r[   r>   r>   r>   r?   <module>   sX     








 
*J*"
