o
    Dhs                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ e,e-Z.ddhZ/ddhZ0h dZ1h dZ2ddhZ3ddhZ4de5dedee
e6e5f  ddfddZ7dededee fddZ8dededee fd d!Z9d"ed#ed$eddfd%d&Z:d'ed"ededdfd(d)Z;d*ed+eddfd,d-Z<d*ede=fd.d/Z>d0ed1eddfd2d3Z?dededee fd4d5Z@dede=fd6d7ZAdedefd8d9ZBdededee fd:d;ZCded<e	e5 dedee fd=d>ZDded<e	e5 dedee fd?d@ZEdAe=defdBdCZFdDed<e	e5 dedee fdEdFZGdee dee fdGdHZHded<e	e5 dedee fdIdJZIefdKedLeded<edef
dMdNZJdKed<e	e5 dedefdOdPZKdKededeee5e	e5 f fdQdRZLdSededeee5eMf fdTdUZNd*ed<e	e5 dedee fdVdWZOdKededeee5eMef fdXdYZPdS )Zz6
Functions related to the main Trafilatura extractor.
    N)deepcopy)AnyOptionalTupleSetUnion)_ElementElement
SubElementstrip_elements
strip_tagstostring)HtmlElement   )delete_by_link_densityhandle_textnodelink_density_test_tablesprocess_nodeprune_unwanted_nodes)TAG_CATALOG	Extractor)FORMATTING_PROTECTEDis_image_filetext_chars_testtrim)delete_element)
BODY_XPATHCOMMENTS_DISCARD_XPATHCOMMENTS_XPATHDISCARD_IMAGE_ELEMENTSOVERALL_DISCARD_XPATHPRECISION_DISCARD_XPATHTEASER_DISCARD_XPATHhireftdth>   r#   r%   r&   >   r#   r$   spancodequoteheadmsgtagtextreturnc                 C   s    t d| |t|p	dpd dS )z/Format extraction event for debugging purposes.z	%s: %s %s NoneN)LOGGERdebugr   )r+   r,   r-    r3   [/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/main_extractor.py
_log_event&   s    r5   elementoptionsc                 C   sz   t | dkrt| |}nt| }t| D ]}t||dd}|dur&|| d|_q|dur;td|	 du r;|S dS )zProcess head elements (titles)r   Fcomments_fixNdoner/   T)
lenr   r   listr   appendr,   r   joinitertext)r6   r7   titlechildprocessed_childr3   r3   r4   handle_titles+   s   
rC   c                 C   s`   t | |}|du rdS |  }|du r|  }|du s |jtvr,td}|d| |S |}|S )z[Process formatting elements (b, i, etc. converted to hi) found
       outside of paragraphsNpr   )r   	getparentgetpreviousr,   r   r	   insert)r6   r7   
formattingparentprocessed_elementr3   r3   r4   handle_formattingE   s   
"rK   new_child_elemsubelemprocessed_subchildc                 C   s@   t | |j}|j|j|_|_|jD ]}|||j|  qdS )z/Add a sub-element to an existing child element.N)r
   r,   r-   tailattribset)rL   rM   rN   sub_child_elemattrr3   r3   r4   add_sub_elementw   s
   
rT   rA   c                 C   sl   | j |_ | dD ]*}|jdkrt||}|dur|| nt||dd}|dur0t||| d|_q	dS )z<Iterate through an element child and rewire its descendants.*r<   NFr8   r:   )r-   iterdescendantsr,   handle_listsr=   r   rT   )rA   rL   r7   rM   rN   r3   r3   r4   process_nested_elements   s   


rX   elemnew_elemc                 C   s"   |  d }r|d| dS dS )z>Copy the rend attribute from an existing element to a new one.rendN)getrQ   )rY   rZ   	rend_attrr3   r3   r4   update_elem_rendition   s   r^   c                 C   s   | duot d|  du S )z"Find if the element contains text.Nr/   T)r   r>   r?   )rY   r3   r3   r4   is_text_element   s   r_   processed_elem	orig_elemc                 C   s.   | durt || j}| j| j|_|_dS dS )z&Create a new sub-element if necessary.N)r
   r,   r-   rO   )r`   ra   	childelemr3   r3   r4   define_newelem   s   rc   c                 C   sZ  t | j}| jdur| j rt|d}| j|_| dD ]}t d}t|dkrPt||}|durO|jp6d|_|jrJ|j rJ| jd|j 7  _|	| n6t
||| |jdur|j rdd |D }|r|d }|jdu sw|j s||j|_n
| jd|j 7  _|jst|dkrt|| |	| d	|_qd	| _t|rt| | |S dS )
z3Process lists elements including their descendants.Nitemr   r/    c                 S   s   g | ]	}|j d kr|qS )r:   r,   .0elr3   r3   r4   
<listcomp>   s    z handle_lists.<locals>.<listcomp>r:   )r	   r,   r-   stripr
   rV   r;   r   rO   r=   rX   r^   r_   )r6   r7   rJ   rL   rA   rB   new_child_elem_childrenlast_subchildr3   r3   r4   rW      s>   







rW   c                 C   sb   |  ds
| jdkrdS |  }|durd| ddv rdS | d}|dur/t| dkr/dS d	S )
zECheck if it is a code element according to common structural markers.langr(   TN	highlightclassr/   r   F)r\   r,   rE   findr;   )r6   rI   r(   r3   r3   r4   is_code_block_element   s   
rs   c                 C   s(   t | }| dD ]}d|_q	d|_|S )z/Turn element into a properly tagged code block.rU   r:   r(   )r   iterr,   )r6   rJ   rA   r3   r3   r4   handle_code_blocks   s
   ru   c                 C   sf   t | rt| S t| j}| dD ]}t||}|dur"t|| d|_qt|r1t|d |S dS )zProcess quotes elements.rU   Nr:   r)   )	rs   ru   r	   r,   rt   r   rc   r_   r   )r6   r7   rJ   rA   rB   r3   r3   r4   handle_quotes   s   



rv   potential_tagsc                 C   s   | j dkrd| ddv rt| S | j |vr%| j dkr#td| j | j dS | j dkrLt| |dd	d
}|durLt|jd	u rL|j  |j dkrJd|_ |S dS )zAHandle diverse or unknown elements in the scope of relevant tags.divzw3-coderq   r/   r:   zdiscarding elementNFTr9   preserve_spacesrD   )	r,   r\   ru   r5   r-   r   r   rP   clear)r6   rw   r7   rJ   r3   r3   r4   handle_other_elements   s   




r|   c           
      C   s  | j   t| dkrt| |S t| j}| dD ]}|j|vr/|jdkr/td|j|j qt	||ddd}|dur|jd	kr_td
d	|j |jrW| jd|jpRd 7  _n|j|_d|_qt|j}|jt
v rt|dkr|D ]}t|jdu rd|j |_t||j qq|jdkr|d|dd n|jdkr|ddur|d|dd |j|j|_|_|jdkrt|}|dur|}|| d|_qt|dkr|d }	|	jdkr|	jdu rt|	 |S |jr|S tdd	t| dS )zIProcess paragraphs along with their children, trim and clean the content.r   rU   r:   zunexpected in pFTry   NrD   z
extra in pre   r/   r#   r[   r$   targetgraphicrk   lbzdiscarding element:)rP   r{   r;   r   r	   r,   rt   r5   r-   r   P_FORMATTINGr   r   rQ   r\   rO   handle_imager=   r   r   )
r6   rw   r7   rJ   rA   rB   newsubrd   
image_elem	last_elemr3   r3   r4   handle_paragraphs  sZ   









r   	is_headerc                 C   s   t d}| r|dd |S )z1Determine cell element type and mint new element.cellroler*   )r	   rQ   )r   cell_elementr3   r3   r4   define_cell_typeb  s   r   
table_elemc                 C   sL  t d}t| ddd d}| dD ]}t|tdd |tD }qd	}d	}|d
kr0t|nd}t d}	|r>|	d| |  D ]}
|
j	dkret
|	dkrd||	 t d}	|r`|	d| |pc|}n|
j	tv r|
j	dkoq| }|pu|}t|}t
|
dkrt|
|}|dur|j|j|_|_nZ|
j|
j|_|_d|
_	|
 D ]I}|j	tv r|j	tv rd|_	t||ddd}n%|j	dkr|jdkrt||}|dur|| d}n
t||dg|}|durt|| d|_	q|jst
|dkr|	| n|
j	dkr nd|
_	qB|	jdd t
|	dkr||	 t
|dkr$|S dS )zProcess single table element.tabletheadtbodytfootr   trc                 s   s     | ]}t |d dV  qdS )colspanr   N)intr\   )rh   r%   r3   r3   r4   	<genexpr>u  s    zhandle_table.<locals>.<genexpr>Fr   r/   rowr'   r&   Nr:   r   T)rz   r9   r<   recallrx   )r	   r   rt   maxsumTABLE_ELEMSstrrQ   rV   r,   r;   r=   r   r   r-   rO   	TABLE_ALLr   focusrW   handle_textelemunionrc   rP   pop)r   rw   r7   newtablemax_colsr   seen_header_rowseen_header	span_attrnewrow
subelementr   rL   processed_cellrA   rN   r3   r3   r4   handle_tablek  st    










r   c                 C   s   | du rdS t | j}dD ]}| |d}t|r!|d|  nq| j D ]\}}|dr<t|r<|d|  nq'| d }rJ|d| | d }rW|d| |jr_|dsadS |dd}|dsw|dt	d	d
| |S )z5Process image elements and their relevant attributes.N)data-srcsrcr/   r   r   altr@   httpz^//zhttp://)
r	   r,   r\   r   rQ   rP   items
startswithresub)r6   rJ   rS   r   valuealt_attr
title_attrsrc_attrr3   r3   r4   r     s0   

r   c                 C   s  d}| j dkrt| |}|S | j tv rt| |}|S | j dkr&t| |}|S | j dkr3t| ||}|S | j dkrRt| jdu rPt| |}|durPt	d}|j|_
|S | j tv r^t| |}|S | j dkrod|v rot| ||}|S | j dkr~d|v r~t| }|S t| ||}|S )	z?Process text element and determine how to deal with its contentNr<   r*   rD   r   Tr   r~   )r,   rW   CODES_QUOTESrv   rC   r   r   rO   r   r	   r-   
FORMATTINGrK   r   r   r|   )r6   rw   r7   new_elementthis_elementr3   r3   r4   r     s>   











r   treeresult_bodyc                    s   t d d} jdkrddg |d7 }t|  }dvr)t|ddd	 nt|d	 ||}|td
d  fdd|D  |S )zLook for all previously unconsidered wild elements, including outside of the determined
       frame and throughout the document to recover potentially missing text partszRecovering wild text elementsz\.//blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, 'w3-code')]r   rx   r   z|.//div|.//lb|.//listr$   ar'   c                 S      | d uS Nr3   xr3   r3   r4   <lambda>      z#recover_wild_text.<locals>.<lambda>c                 3       | ]	}t | V  qd S r   r   rh   er7   rw   r3   r4   r     s    z$recover_wild_text.<locals>.<genexpr>)	r1   r2   r   updateprune_unwanted_sectionsr   xpathextendfilter)r   r   r7   rw   search_exprsearch_treesubelemsr3   r   r4   recover_wild_text   s   




r   c                 C   s*  |j dk}t| tdd} d|vrt| t} |j dkr&t| t} |r&t| t} tdD ]}t| dd|d} t| d	d
|d} t| dd
|d} q*d|v sK|r_| dD ]}t	|du r^t
|d
d qP|rt| dkr| d jdkrt
| d d
d t| dkr| d jdksnt| dd
dd} t| dd
dd} | S )z1Rule-based deletion of targeted document sections	precisionT)with_backupr~   r      rx   )backtrackingfavor_precisionr<   FrD   r   	keep_tailr   rk   r*   r)   )r   r   r    r   r"   r!   ranger   rt   r   r   r;   r,   )r   rw   r7   r   _rY   r3   r3   r4   r     s0   




r   c           	         s  t t jdu rg d  jdu rd  jdu r$d td}tD ]}t	dd || D d }|d u r=q*t
| }t|dkrJq*|d	} jd
krWd}nd}|rgtd| j| k rld dvrut|d dvr~t|d tt |d}dd |D dhkr|g}|dd  fdd|D D  t|dkr|d jtv rt|d dd t|dkr|d jtv st|dkrttt|  nq*d|  }||fS )NT)r   r%   r&   r   r~   r$   bodyc                 s       | ]	}|d ur|V  qd S r   r3   rh   sr3   r3   r4   r   D      z_extract.<locals>.<genexpr>r   z//p//text()r   r      r/   rx   r'   .//*c                 S   s   h | ]}|j qS r3   rf   r   r3   r3   r4   	<setcomp>]  s    z_extract.<locals>.<setcomp>r   c                 S   s   g | ]}|d ur|qS r   r3   rg   r3   r3   r4   rj   `  s    z_extract.<locals>.<listcomp>c                 3   r   r   r   r   r   r3   r4   r   `  r   rk   Fr   re   )rQ   r   tablesr   imagesaddlinksr	   r   nextr   r;   r   r   r>   min_extracted_sizer   r1   r2   sortedr   r,   NOT_AT_THE_ENDr   r   r   r?   rl   )	r   r7   r   exprsubtreeptestfactorr   	temp_textr3   r   r4   _extract7  sN   










$
r   cleaned_treec                 C   st   t | }t| |\}}}t|dkst||jk r)t||||}d|  }t|d t	|d ||t|fS )zFind the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert themr   re   r:   rx   )
r   r   r;   r   r   r>   r?   rl   r   r   )r   r7   backup_treer   r   rw   r3   r3   r4   extract_contentl  s   

r   c                 C   s2   | j |v rt| |dd}|dur|j  |S dS )z?Process comment node and determine how to deal with its contentTr8   N)r,   r   rP   r{   )rY   rw   r7   rJ   r3   r3   r4   process_comments_node  s   

r   c              	      s   t d}tttD ]G}tdd || D d}|du rq
t|t}t|ddd |t	dd	  fd
d|
dD  t|dkrQt| t|dd  nq
d|  }||t|| fS )z>Try to extract comments out of potential sections in the HTML.r   c                 s   r   r   r3   r   r3   r3   r4   r     r   z#extract_comments.<locals>.<genexpr>Nr   r$   r'   c                 S   r   r   r3   r   r3   r3   r4   r     r   z"extract_comments.<locals>.<lambda>c                 3   r   r   )r   r   r   r3   r4   r     r   r   r   Fr   re   )r	   rQ   r   r   r   r   r   r   r   r   r   r;   r1   r2   r   r>   r?   rl   )r   r7   comments_bodyr   r   temp_commentsr3   r   r4   extract_comments  s    
*
r   )Q__doc__loggingr   copyr   typingr   r   r   r   r   
lxml.etreer   r	   r
   r   r   r   	lxml.htmlr   htmlprocessingr   r   r   r   r   settingsr   r   utilsr   r   r   r   xmlr   xpathsr   r   r   r   r    r!   r"   	getLogger__name__r1   r   r   r   r   r   r   r   bytesr5   rC   rK   rT   rX   r^   boolr_   rc   rW   rs   ru   rv   r|   r   r   r   r   r   r   r   r   r   r   r   r   r3   r3   r3   r4   <module>   sV    $
&2)	""R"	R"%"$" 5"&