o
    Dh^K                  	   @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZ ddlmZmZ dd	lmZmZ eeZed
ZdedefddZh dZddhZh dZh dZh dZh dZ edej!edej!edej!edej!edej!edej!dZ"ddhZ#dd hZ$d!ede%fd"d#Z&G d$d% d%Z'G d&d' d'Z(ed(ej!ed)ej!d*Z)ed+ej!Z*d,ede+fd-d.Z,i fded/ede+fd0d1Z-dS )2a  Minimalistic fork of readability-lxml code

This is a python port of a ruby port of arc90's readability project

http://lab.arc90.com/experiments/readability/

Given a html document, it pulls out the main body text and cleans it up.

Ruby port by starrhorne and iterationlabs
Python port by gfxmonk

For list of contributors see
https://github.com/timbertson/python-readability
https://github.com/buriy/python-readability

License of forked code: Apache-2.0.
    N)sqrt)
attrgetter)AnyDictOptionalSet)tostring)HtmlElementfragment_fromstring   )	load_htmltrimz\.( |$)stringreturnc                 C   s   t | tddS )Nxml)encodingmethod)r   str)r    r   ]/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/readability_lxml.py	_tostring&   s   r   >
   apdloluldivimgpretable
blockquoter   article>   tdr   r    >	   ddr   dtlir   r   formasideaddress>
   h1h2h3h4h5h6thnavfooterheader>   r   r   r%   r   embedinputzcombx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitterz#and|article|body|column|main|shadowzKarticle|body|content|entry|hentry|main|page|pagination|post|text|blog|storyzbutton|combx|comment|com-|contact|figure|foot|footer|footnote|form|input|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widgetz.<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)z+https?:\/\/(?:www\.)?(?:youtube|vimeo)\.com)unlikelyCandidatesReokMaybeItsACandidateRe
positiveRe
negativeRedivToPElementsRevideoRebodyhtmlr   r   elemc                 C   s   t t|  S )z7Return the length of the element with all its contents.)lenr   text_contentr=   r   r   r   text_lengthW   s   rA   c                   @   s.   e Zd ZdZddgZdededdfddZdS )	Candidatez,Defines a class to score candidate elements.scorer=   r   Nc                 C   s   || _ || _d S N)rC   r=   )selfrC   r=   r   r   r   __init__a   s   
zCandidate.__init__)__name__
__module____qualname____doc__	__slots__floatr	   rF   r   r   r   r   rB   \   s    rB   c                	   @   s   e Zd ZdZg dZd$dedededd	fd
dZdefddZ	de
eef dedefddZde
eef dee fddZdedefddZde
eef fddZdedefddZdedefddZd%ddZd%dd Zd!ede
eef defd"d#Zd	S )&Documentz,Class to build a etree document out of html.docmin_text_lengthretry_length      rO   rP   rQ   r   Nc                 C   s   || _ || _|| _dS )a  Generate the document

        :param doc: string of the html content.
        :param min_text_length: Set to a higher value for more precise detection of longer texts.
        :param retry_length: Set to a lower value for better detection of very small texts.

        The Document class is not re-enterable.
        It is designed to create a new Document() for each HTML file to process it.

        API method:
        .summary() -- cleaned up content
        NrN   )rE   rO   rP   rQ   r   r   r   rF   k   s   
zDocument.__init__c           	      C   s   | j ddD ]}|  qd}	 |r|   |   |  }| |}|r-| ||}n |du r9d}t	d qt	d | j 
d}|durJ|n| j }| ||}t|pWd	}|rc|| jk rcd}q|S )
z
        Given a HTML file, extracts the text of the article.

        Warning: It mutates internal DOM representation of the HTML document,
        so it is better to call other API methods before this one.
        scriptstyleTFz5Ended up stripping too much - going for a safer parsez=Ruthless and lenient parsing did not work. Returning raw htmlr;   N )rO   iter	drop_treeremove_unlikely_candidates&transform_misused_divs_into_paragraphsscore_paragraphsselect_best_candidateget_articleLOGGERdebugfindsanitizer>   rQ   )	rE   r=   ruthless
candidatesbest_candidater!   r;   cleaned_articlearticle_lengthr   r   r   summary|   s8   

zDocument.summaryrc   rd   c                 C   s   t d|jd }td}|j }|d urt|n|jg}|D ]H}d}||jks3||v r6|| j|kr6d}n*|jdkr`| |}	|jpDd}
t	|
}|dkrQ|	d	k s^|dkr`|	d
kr`t
|
r`d}|rg|| q|S )N
   皙?z<div/>FTr   rV   P   g      ?r   )maxrC   r
   r=   	getparentlisttagget_link_densitytextr>   	DOT_SPACEsearchappend)rE   rc   rd   sibling_score_thresholdoutputparentsiblingssiblingrs   link_densitynode_contentnode_lengthr   r   r   r]      s2   





zDocument.get_articlec                 C   s\   |sd S t | tddd}ttjr(|d d D ]}td|jj	|j
 qtt|S )NrC   T)keyreverse   zTop 5: %s %s)sortedvaluesr   r^   isEnabledForloggingDEBUGr_   r=   rn   rC   nextrW   )rE   rc   sorted_candidates	candidater   r   r   r\      s   zDocument.select_best_candidater=   c                 C   s,   t |pd}tdd |dD }|| S )Nr   c                 s   s    | ]}t |V  qd S rD   )rA   ).0linkr   r   r   	<genexpr>       z,Document.get_link_density.<locals>.<genexpr>z.//a)rA   sumfindall)rE   r=   total_lengthlink_lengthr   r   r   ro      s   zDocument.get_link_densityc           
      C   s   i }| j dddD ]]}| }|d u rq
| }t| }t|}|| jk r)q
||fD ]}|d ur>||vr>| |||< q-dt|d t	|d d }||  j
|7  _
|d urg||  j
|d 7  _
q
| D ]\}}	|	 j
d| | 9  _
ql|S )	Nr   r   r"   r   ,d         )rO   rW   rl   r   r?   r>   rP   
score_nodesplitminrC   itemsro   )
rE   rc   r=   parent_nodegrand_parent_node	elem_textelem_text_lennoderC   r   r   r   r   r[      s,   
 zDocument.score_paragraphsc                 C   sT   d}t d |d|dfD ]}td |r|d8 }td |r'|d7 }q|S )Nr   classidr8   rR   r7   )filtergetREGEXESrr   )rE   r=   weight	attributer   r   r   class_weight  s   zDocument.class_weightc                 C   sl   |  |}t|j}| }|tv r|d7 }n|tv r |d7 }n|tv r)|d8 }n|tv r1|d8 }t||S )Nr~   r   )	r   r   rn   lower
DIV_SCORESBLOCK_SCORESBAD_ELEM_SCORESSTRUCTURE_SCORESrB   )rE   r=   rC   rn   namer   r   r   r     s   





zDocument.score_nodec              	   C   sr   | j dD ]0}dtd |d|df}t|dk rq|jtvr6td 	|r6td 	|s6|
  qd S )Nz.//* r   r   r   r5   r6   )rO   r   joinr   r   r>   rn   
FRAME_TAGSr   rr   rX   )rE   r=   attrsr   r   r   rY     s    
z#Document.remove_unlikely_candidatesc              	   C   s   | j dD ]}td dttt|sd|_q| j dD ]N}|j	r=|j	
 r=td}|j	d |_	|_	|d| tt|ddD ])\}}|jre|j
 retd}|jd |_	|_||d	 | |jd
krn|  qEq!d S )Nz.//divr9   rV   r   z<p/>r   T)r}   r   br)rO   r   r   rr   r   mapr   rm   rn   rp   stripr
   insertr   	enumeratetailrX   )rE   r=   p_elemposchildr   r   r   rZ   )  s*   	
z/Document.transform_misused_divs_into_paragraphsr   c              	      s  | ddddddD ]}| |dk s| |dkr|  q
| d	d
D ]    q%| dD ] d jv rFtd  jd rFd _q1   q1t }t	|
dD ]}  |v r]qU|  } |v rk|  jnd}|| dk rtd j||    qU  ddk rd} fddtD }|d  d8  < |d  t d8  < t }	|  }
  }|d ur||v r|| jnd}|d r|d d|d d  krd|d  d}n|d |d kr jtvrd }n|d |d d! krd"}n|	| jk r|d dkrd#|	 d$}n|	| jk r%|d d%kr%d#|	 d&}n|d'k r9|
d(kr9d)|
d*d+| }n|d'krM|
d,krMd)|
d*d+| }nr|d- dkrY|	d.k s`|d- dkrcd/}n\|	sd0}g }  D ]}t|}|r~||  nqnt|d } jdd1D ]}t|}|r|| t||kr nq|rt|d2krd3}|  d4d5d6d7 nd3}|rӈ   td8| j||pd9 qU|| _t| jS ):Nr)   r*   r+   r,   r-   r.   r   gQ?r&   textareaiframesrcr:   VIDEOz6//table|//ul|//div|//aside|//header|//footer|//sectionz+Removed %s with score %6.3f and weight %-3sr   rh   Tc                    s"   i | ]}|t  d | qS )z.//)r>   r   )r   kindr@   r   r   
<dictcomp>g  s    z%Document.sanitize.<locals>.<dictcomp>r%   r   r4   z.//input[@type="hidden"]r   r   r   g?ztoo many images ()zmore <li>s than <p>sr   zless than 3x <p>s than <input>sztoo short content length z without a single imager   z and too many imagesrR   ri   ztoo many links z.3fz for its weight g      ?r3   K   z<<embed>s with too short content length, or too many <embed>sz
no content)	precedingi  Fr   r   r   sectionz0Removed %6.3f %s with weight %s cause it has %s.rV   )rW   r   ro   rX   attribr   rr   rp   setreversedxpathrC   r^   r_   rn   r?   countTEXT_CLEAN_ELEMSr>   r   rA   rl   	LIST_TAGSrP   itersiblingsrs   r   updaterO   r   )rE   r   rc   r2   allowedr   rC   	to_removecountscontent_lengthry   r   reasonrw   sibsib_content_lengthlimitr   r@   r   ra   F  s   







 



zDocument.sanitize)rR   rS   )r   N)rG   rH   rI   rJ   rK   r	   intrF   r   rg   r   rB   r]   r   r\   rL   ro   r[   r   r   rY   rZ   ra   r   r   r   r   rM   f   s    ,)$	

"rM   z-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remotez+and|article|body|column|content|main|shadow)unlikelyCandidatesokMaybeItsACandidatezdisplay:\s*noner   c                 C   sT   d| j v rt| ddrdS d| j v rdS | ddkr(d| ddvr(dS d	S )
zT
    Checks if the node is visible by considering style, attributes, and class.
    rU   rV   Fhiddenzaria-hiddentruezfallback-imager   T)r   DISPLAY_NONErr   r   )r   r   r   r   is_node_visible  s   
r   optionsc                 C   s   t | }|du r
dS |dd}|dd}|dt}t|d}|d	d
 |dD  d}|D ]G}||s;q4|dd d|dd }	td |	rYtd |	sYq4|dr_q4t|	 
 }
|
|k rlq4|t|
| 7 }||kr{ dS q4dS )z]
    Decides whether or not the document is reader-able without parsing the whole thing.
    NFmin_content_length   	min_score   visibility_checkerz.//p | .//pre | .//articlec                 s   s    | ]}|  V  qd S rD   )rl   )r   r   r   r   r   r     r   z)is_probably_readerable.<locals>.<genexpr>z	.//div/brg        r   rV   r   r   r   r   z./parent::li/pT)r   r   r   r   r   r   REGEXPSrr   r>   r?   r   r   )r<   r   rO   r   r   r   nodesrC   r   class_and_idtext_content_lengthr   r   r   is_probably_readerable  s:   
r   ).rJ   r   remathr   operatorr   typingr   r   r   r   
lxml.etreer   	lxml.htmlr	   r
   utilsr   r   	getLoggerrG   r^   compilerq   r   r   DIV_TO_P_ELEMSr   r   r   r   r   Ir   r   r   r   rA   rB   rM   r   r   boolr   r   r   r   r   r   <module>   sl   


  [
