o
    DhP9                     @   st  d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZmZ ddl m!Z!m"Z" e#e$Z%ddddddddddddZ&dd e&' D Z(h dZ)dededefddZ*dmdede+defddZ,	 dnded!ee d"e-defd#d$Z.d%ee dee/e/e/ee+ f fd&d'Z0	 dnd(ed)e+d*e-dee-ee+ f fd+d,Z1d(ede-fd-d.Z2	 	 dod/ed0e+d1e-d*e-def
d2d3Z3	4	 dpd5eded6e-d7e-dee f
d8d9Z4d5ededee fd:d;Z5d5eddfd<d=Z6d5eddfd>d?Z7d5eddfd@dAZ8d5eddfdBdCZ9d5eddfdDdEZ:d5eddfdFdGZ;i dHe6dIe6dJe6dKe8dLe8dMe8dNe8dOe8dPe8dQe9dRe9dSe7dTe7dUe7dVe:dWe:dXe:dYe;iZ<d5edZee+ ddfd[d\Z=	dqdeded]ee+ defd^d_Z>dJd`dTdSdadb dQdcdddedb df	Z?dedefdgdhZ@dndiedje-de+fdkdlZAdS )rz*
Functions to process nodes in HTML code.
    N)deepcopy)ListOptionalTuple)fix_relative_urlsget_base_url)_ElementElement
SubElementXPath
strip_tagstostring)HtmlElement   )duplicate_test)Document	ExtractorCUT_EMPTY_ELEMSMANUALLY_CLEANEDMANUALLY_STRIPPED)
textfiltertrimis_image_element)META_ATTRIBUTESdelete_element#iz#bz#uz#tz#subz#sup)emibstrongukbdsampttvarsubsupc                 C   s   i | ]\}}||qS  r'   ).0kvr'   r'   [/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/htmlprocessing.py
<dictcomp>+   s    r,   >   figuresourcepicturetreeoptionsreturnc                 C   s   t  t }}|js|g d n| dD ]}d|_q|jr.dd |D }|d t	| | |j
dkr^| dd	ur^t| }|D ]}| |D ]}t| qLqE| dd	u r]|} n|D ]}| |D ]}t| qgq`t| |j
S )
z/Prune the tree by discarding unwanted elements.)tabletdthtrz.//figure[descendant::table]divc                 S   s   g | ]}|t vr|qS r'   )PRESERVE_IMG_CLEANINGr(   er'   r'   r+   
<listcomp><   s    z!tree_cleaning.<locals>.<listcomp>imgrecallz.//pN)r   copyr   tablesextendxpathtagimagesremover   focusfindr   iterr   
prune_html)r0   r1   cleaning_liststripping_listelemtcopy
expressionelementr'   r'   r+   tree_cleaning0   s.   



rO   balancedrE   c                 C   s2   |dk}|  dD ]}|jtv rt||d q	| S )zADelete selected empty elements to save space and processing time.	precisionz-.//processing-instruction()|.//*[not(node())])	keep_tail)rA   rB   r   r   )r0   rE   tailsrN   r'   r'   r+   rH   S   s   
rH   Fnodelistwith_backupc           	      C   s   |rt |  }t| }|D ]0}|| D ])}|jdur6| }|du r'| }|dur6|jp/dd |j |_| | qq|rQt |  }||d krO| S |S | S )z2Prune the HTML tree by removing unwanted sections.N     )lentext_contentr   tailgetprevious	getparentrD   )	r0   rT   rU   old_lenbackuprM   subtreeprevnew_lenr'   r'   r+   prune_unwanted_nodes]   s"   
rc   links_xpathc                 C   sL   dd dd | D D }t tt|}tdd |D }t|t|||fS )zCollect heuristics on link textc                 S   s   g | ]}|r|qS r'   r'   r9   r'   r'   r+   r;   }   s    z%collect_link_info.<locals>.<listcomp>c                 s   s    | ]	}t | V  qd S N)r   rZ   )r(   rK   r'   r'   r+   	<genexpr>}       z$collect_link_info.<locals>.<genexpr>c                 s   s    | ]	}|d k rdV  qdS )
   r   Nr'   )r(   lr'   r'   r+   rf      rg   )listmaprY   sum)rd   mylistlengths
shortelemsr'   r'   r+   collect_link_infoy   s   rp   rN   textfavor_precisionc                 C   s  |  d}|sdg fS g }t|dkr5|rdnd}t|d  }t||kr5t|t|d kr5dg fS | jd	krE|  d
u rBdnd}n|  d
u rNd}nd}t|}||k rt|\}	}
}}|
dkrhd|fS td|	|||
 |	|d ks|
dkr||
 dkrd|fS d|fS )z>Remove sections which are rich in links (probably boilerplate).//refFr   rh   d   r   g?TpN<      i,  u8   list link text/total: %s/%s – short elems/total: %s/%s皙?)	findallrY   r   rZ   rB   getnextrp   LOGGERdebug)rN   rq   rr   rd   rm   len_threshold	link_textlimitlenelemlenlinklenelemnumro   r'   r'   r+   link_density_test   s:   
 
 r   c                 C   sx   |  d}|s	dS tt|  }|dk rdS t|\}}}}|dkr%dS td|| |dk r6|d| kS |d	| kS )
z=Remove tables which are rich in links (probably boilerplate).rs   F   r   Tztable link text: %s / total: %si  rx   g      ?)ry   rY   r   rZ   rp   r{   r|   )rN   rd   r   r   r   _r'   r'   r+   link_density_test_tables   s   
 r   r`   tagnamebacktrackingc                 C   s   g }|rdnd}|rdnd}|  |D ]/}t| }t|||\}	}
|	s=|rB|
rBdt|  k r5|k rBn qt||krB|| qt|D ]}t| qH| S )z{Determine the link density of elements with respect to their length,
    and remove the elements identified as boilerplate.r   rt   r      r   )	rG   r   rZ   r   rY   appenddictfromkeysr   )r`   r   r   rr   	deletionsr}   depth_thresholdrK   elemtextresulttemplistr'   r'   r+   delete_by_link_density   s$   

r   TrK   comments_fixpreserve_spacesc                 C   s   | j dkrt| r| S | j dkst| dkr| js| jsdS |s1| j dkr1|s/t| jp-d| _| S | jsLt| dkrL| jd| _| _|rL| j dkrLd| _ |sat| jpTd| _| jrat| jp_d| _| jsht| sp|jrrt| |rrdS | S )z3Convert, format, and probe potential text elements.graphicdoner   NlbrV   ru   )	rB   r   rY   rq   r[   r   r   dedupr   )rK   r1   r   r   r'   r'   r+   handle_textnode   s4   "r   c                 C   s   | j dkst| dkr| js| jsdS t| jpdt| jpd| _| _| j dkr7| js7| jr7| jd| _| _| js=| jrKt| sI|jrKt| |rKdS | S )zBConvert, format, and probe potential text elements (light format).r   r   Nr   )rB   rY   rq   r[   r   r   r   r   )rK   r1   r'   r'   r+   process_node  s   ""r   c                 C   sn   |  d| j d| _d}| dddD ]!}|jdv r1| dt|j d|  |jdkr1|d7 }d	|_qd
S )zGConvert <ul> and <ol> to <list> and underlying <li> elements to <item>.rendrj   r   dddtli)r   r   -itemN)setrB   rG   str)rK   r   subelemr'   r'   r+   convert_lists   s   

r   c                 C   sj   d}| j dkr)t| dkr| d j dkrd}| d}|r)d}|D ]}|j  q!|r0d| _ d
S d	| _ d
S )z?Convert quoted elements while accounting for nested structures.Fprer   r   spanTz#.//span[starts-with(@class,'hljs')]codequoteN)rB   rY   rA   attribclear)rK   	code_flag
code_elemsr   r'   r'   r+   convert_quotes0  s   

r   c                 C   s"   | j   | d| j d| _dS )z$Add head tags and delete attributes.r   headN)r   r   r   rB   rK   r'   r'   r+   convert_headingsA  s   

r   c                 C   s
   d| _ dS )zConvert <br> and <hr> to <lb>r   N)rB   r   r'   r'   r+   convert_line_breaksH  s   
r   c                 C   s   d| _ | dd dS )z7Convert <del>, <s>, <strike> to <del rend="overstrike">delr   
overstrikeN)rB   r   r   r'   r'   r+   convert_deletionsM  s   r   c                 C   s    d| _ | dD ]}d|_ qdS )zHandle details and summary.r7   summaryr   N)rB   rG   )rK   r   r'   r'   r+   convert_detailsS  s   r   dlolulh1h2h3h4h5h6brhr
blockquoter   qr   sstrikedetailsbase_urlc                 C   s@   d| _ | d}| j  |r|rt||}| d| dS dS )z7Replace link tags and href attributes, delete the rest.refhreftargetN)rB   getr   r   r   r   )rK   r   r   r'   r'   r+   convert_linkq  s   


r   urlc                 C   s   |j sd}|jr|d7 }| |D ]}d|_qt| d n|o"t|}| ddD ]}t|| q)|jrP| t	
 D ]}|j  |dt	|j  d|_q;n
t| gt	
 R   | t
 D ]	}t|j | qa|jry| dD ]}d|_qs| S )	zBSimplify markup and convert relevant HTML tags to an XML standard.z).//*[self::div or self::li or self::p]//az|.//table//ar   ar   hir<   r   )linksr?   rA   rB   r   r   rG   r   
formattingREND_TAG_MAPPINGkeysr   r   r   CONVERSIONSrC   )r0   r1   r   
xpath_exprrK   r   r'   r'   r+   convert_tags}  s.   
r   r   c                 C   s   dt | dddd   S )Nhr   r   r   )intr   r   r'   r'   r+   <lambda>  s    r   r   r   c                 C   s   t | dd S )Nr   r   )HTML_TAG_MAPPINGr   r   r'   r'   r+   r     s    )	rj   r   r   r   r   r   r<   r   r   c                 C   s   |  t D ],}tt|j }t|r|||_n||_|jdkr.|d|jdd q|j	  qd| _t
d}||  |S )zConvert XML to simplified HTML.r   r   r   rV   bodyhtml)rG   HTML_CONVERSIONSr   r   rB   callabler   r   popr   r	   r   )r0   rK   
conversionrootr'   r'   r+   convert_to_html  s   

r   documentwith_metadatac                 C   s\   t | j}|r%td}tD ]}t| | }rt|d||d q|d| t|ddd S )z1Convert the document to HTML and return a string.r   meta)namecontentr   Tunicode)pretty_printencoding)	r   r   r	   r   getattrr
   insertr   strip)r   r   	html_treer   r   valuer'   r'   r+   build_html_output  s   
r   )rP   )F)FF)TFre   )B__doc__loggingr>   r   typingr   r   r   courlan.urlutilsr   r   
lxml.etreer   r	   r
   r   r   r   	lxml.htmlr   deduplicationr   settingsr   r   r   r   r   utilsr   r   r   xmlr   r   	getLogger__name__r{   r   itemsr   r8   rO   r   rH   boolrc   r   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r'   r'   r'   r+   <module>   s&   
#


(
"
.	

(