o
    DhD                     @   sh  d Z zddlZdZW n ey   dZY nw ddlZddlZzddlZdZW n ey1   dZY nw ddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlmZ zddlZdZW n eyg   dZY nw zddlZdZW n ey{   dZY nw zddlZdZW n ey   dZY nw zdd	lmZ W n ey   dZY nw dd
lmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' e(e)Z*ddhZ+e,dej-Z.e,dej-Z/e,dZ0e$ddddddZ1ej,dej2ej3B dZ4e,dZ5e,dZ6h dZ7ddhZ8dZ9e,dZ:ej,dej;dZ<de=de=fd d!Z>d"e=de?fd#d$Z@d%e=deeA fd&d'ZBdee=eAf deAfd(d)ZCd*eAde?fd+d,ZDd-eAd*eAdeAfd.d/ZEd0eAdee# fd1d2ZFd0edee# fd3d4ZGe	d5d6d7eAdeAfd8d9ZHd:eAdeAfd;d<ZIdmd:eAd>ed? deAfd@dAZJe	dBd6dndCeAdDe?dEe?deeA fdFdGZKdndHeAdDe?dEe?deeA fdIdJZLdKe!de!fdLdMZMe	dBd6d:eAdeAfdNdOZNdPe!de?fdQdRZOdSeeA de?fdTdUZPdVedWeQdefdXdYZRdZeQd[ede?fd\d]ZSdodKe#d^eAd_e?de?fd`daZTdbeAdceAdeeA fdddeZUdbeAdceAd^eAdfedee?ef f
dgdhZVdPe!de?fdidjZWd:eeA de?fdkdlZXdS )pzj
Module bundling functions related to HTML and text processing,
content filtering and language detection.
    NTF)	lru_cache)islice)AnyListLiteralOptionalTupleUnion	normalize)detect)
from_bytes)_Element)HtmlElement
HTMLParser
fromstring)HTTPResponseutf-8utf_8z^< ?! ?DOCTYPE.+?/ ?>z(<html.*?)\s*/>z(<!--.*?-->|<[^>]*>))collect_idsdefault_doctypeencodingremove_comments
remove_pisz(?<![p{P}>])\n)flagsz^https?://|/+$z3[^\s]+\.(avif|bmp|gif|hei[cf]|jpe?g|png|webp)(\b|$)>   phitdrefcellheaditemquotecodepre)zhttp-equiv="content-language"zproperty="og:locale"z
([a-z]{2})z\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$filecontentreturnc                 C   s   t | ts| S tr&| dd dkr&zt| W S  ty%   td Y nw trF| dd dkrFzt	| W S  t	j
yE   td Y nw trYzt| W S  tjyX   Y nw trmzt| W S  tjyl   Y | S w | S )z
    Don't trust response headers and try to decompress a binary string
    with a cascade of installed packages. Use magic numbers when available.
    N   s   zinvalid GZ file   s   (/zinvalid ZSTD file)
isinstancebytesHAS_GZIPgzip
decompress	ExceptionLOGGERwarningHAS_ZSTD	zstandard	ZstdError
HAS_BROTLIbrotlierrorHAS_ZLIBzlib)r%    r9   R/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/utils.pyhandle_compressed_file^   s8   
r;   datac                 C   s&   z|  d W dS  ty   Y dS w )zLSimple heuristic to determine if a bytestring uses standard unicode encodingzUTF-8FT)decodeUnicodeDecodeError)r<   r9   r9   r:   isutf8   s   r?   bytesobjectc                 C   s   t | rdgS g }tdurt| d }|dur||  t| dk r)t| }nt| dd | dd  p:t| }t|dkrK|dd	 |D  d
d	 |D S )z="Read all input or first chunk and return a list of encodingsr   Nr   i'  i  ixr   c                 S   s   g | ]}|j qS r9   )r   ).0rr9   r9   r:   
<listcomp>   s    z#detect_encoding.<locals>.<listcomp>c                 S   s   g | ]}|t vr|qS r9   )UNICODE_ALIASES)rA   gr9   r9   r:   rC      s    )r?   cchardet_detectappendlowerlenr   extend)r@   guessescchardet_guessdetection_resultsr9   r9   r:   detect_encoding   s   
rN   c              
   C   sp   t | tr| S d}t| } t| D ]}z| |}W n ttfy-   td| d}Y qw  |p7t| dddS )zCheck if the bytestring could be GZip and eventually decompress it,
       guess bytestring encoding and try to decode to Unicode string.
       Resort to destructive conversion otherwise.Nzwrong encoding detected: %sr   replace)r   errors)	r)   strr;   rN   r=   LookupErrorr>   r/   r0   )r%   htmltextguessed_encodingr9   r9   r:   decode_file   s   
rU   	beginningc                 C   s   d| vS )zOAssess if the object is proper HTML (awith a corresponding tag or declaration).htmlr9   )rV   r9   r9   r:   is_dubious_html   s   rX   
htmlstringc                 C   s   d|v r|  d\}}}tjd|ddd | } tt|  D ]\}}d|v r8|dr8tjd| dd}  | S |d	kr? | S q | S )
z>Repair faulty HTML strings to make then palatable for libxml2.doctype
    )countz<htmlz/>z\1>   )	partitionDOCTYPE_TAGsub	enumerateiter
splitlinesendswithFAULTY_HTML)rY   rV   	firstline_restiliner9   r9   r:   repair_faulty_html   s   rm   
htmlobjectc              
   C   sR   d}zt | ddtd}W |S  ty( } ztd| W Y d}~|S d}~ww )z!Try to pass bytes to LXML parser.Nutf8surrogatepassparserzlxml parser bytestring %s)r   encodeHTML_PARSERr.   r/   r6   )rn   treeerrr9   r9   r:   fromstring_bytes   s   rw   c              
   C   s(  t | tr| S t | tst| dr| j} t | ttfs"tdt| d}t	| } | dd 
 }t|}t| |} d}zt| td}W n& tyR   t| }d}Y n tyi } ztd| W Y d}~nd}~ww |du stt|d	k rz|szt| }|dur|du rt|d
k rtdt| d}|S )zLoad object given as input and validate its type
    (accepted: lxml.html tree, trafilatura/urllib3 response, bytestring and string)
    r<   zincompatible input typeN2   Frq   Tzlxml parsing failed: %sr]   r_   z9parsed tree length: %s, wrong data type or not valid HTML)r)   r   r   hasattrr<   r*   rQ   	TypeErrortyperU   rH   rX   rm   r   rt   
ValueErrorrw   r.   r/   r6   rI   )rn   ru   rV   
check_flagfallback_parserv   r9   r9   r:   	load_html   s:   

r   i @  )maxsizecharc                 C   s   |   s|  r
| S dS )z3Return a character if it belongs to certain classesr\   )isprintableisspace)r   r9   r9   r:   return_printables_and_spaces
  s   r   stringc                 C   s   d tt| S )z6Prevent non-printable and XML invalid character errorsr\   )joinmapr   r   r9   r9   r:   remove_control_characters  s   r   NFCunicodeform)r   NFDNFKCNFKDc                 C   s
   t || S )z;Normalize the given string to the specified unicode format.r
   )r   r   r9   r9   r:   normalize_unicode  s   
r   i   rl   preserve_spacetrailing_spacec                 C   s   t | dddddd}|sDttd|}tttj|r&d}|S |rD| d	  r0dnd
}| d  r:dnd
}d
	|||g}|S )zmRemove HTML space entities, then discard incompatible unicode
       and invalid XML characters on line levelz&#13;z&#10;r[   z&nbsp;     Nr   r\   )
r   rO   trimLINES_TRIMMINGrb   allr   rQ   r   r   )rl   r   r   new_linespace_beforespace_afterr9   r9   r:   line_processing  s    r   textc                    sR   |rt |  dS zdtd fdd|  D ddW S  ty(   Y dS w )z<Convert text and discard incompatible and invalid charactersTr[   Nc                 3   s    | ]}t | V  qd S )N)r   )rA   lr   r9   r:   	<genexpr>6  s    zsanitize.<locals>.<genexpr>u   ␤r\   )r   r   filterre   rO   AttributeError)r   r   r   r9   r   r:   sanitize/  s   ,r   ru   c                 C   s   |   D ]Y}| }|dur|jnd}|jtv p|tv }|jtv p&|tv p&|}|jD ]}d|v rF|j| r@|ddd | jvrF|j| q*|j	rRt
|j	|||_	|jr]t
|j|||_q| S )z?Trims spaces, removes control characters and normalizes unicodeNr\   :r]   r   )rd   	getparenttagSPACING_PROTECTEDFORMATTING_PROTECTEDattribsplitnsmappopr   r   tail)ru   elemparent
parent_tagr   r   	attributer9   r9   r:   sanitize_tree;  s    
 r   c              	   C   s.   z
d |   W S  ttfy   Y dS w )z/Remove unnecessary spaces within a text string.r   r\   )r   r   stripr   rz   r   r9   r9   r:   r   S  s
   r   elementc                 C   sT   dD ]}|  |d}t|r dS q| j D ]\}}|dr't|r' dS qdS )z*Check if an element is a valid img element)data-srcsrcr\   Tr   F)getis_image_filer   items
startswith)r   attrr   valuer9   r9   r:   is_image_element]  s   r   imagesrcc                 C   s&   | du s
t | dkrdS tt| S )zCheck if the observed string corresponds to a valid image extension.
       Use a length threshold and apply a regex on the content.Ni    F)rI   boolIMAGE_EXTENSIONsearch)r   r9   r9   r:   r   k  s   r   iterablenc                 c   s<    t | }tt|| }r|V  tt|| }sdS dS )zChunk data into smaller pieces.N)rd   tupler   )r   r   iteratorbatchr9   r9   r:   make_chunkss  s
   r   my_lenoptionsc                 C   s>   | |j k rtd|j dS | |jkrtd| |j dS dS )z=Check if the document length is within acceptable boundaries.ztoo small/incorrect for URL %sFztoo large: length %s for URL %sT)min_file_sizer/   r6   urlmax_file_size)r   r   r9   r9   r:   is_acceptable_length{  s   

r   target_languagestrictc                    s   t D ]$}| d| d}|r&t fdd|D r dS td|  dS q|rD| d}|rDt fd	d|D r=dS td
 dS td dS )zrCheck HTML meta-elements for language information and split
       the result in case there are several languages.z	.//meta[@z][@content]c                 3   *    | ]} t |d d v V  qdS )contentr\   NRE_HTML_LANGr   r   rH   rA   r   r   r9   r:   r        ( z"check_html_lang.<locals>.<genexpr>Tz%s lang attr failedFz//html[@lang]c                 3   r   )langr\   Nr   r   r   r9   r:   r     r   zHTML lang failedzNo relevant lang elements found)TARGET_LANG_ATTRSfindallanyr/   debugxpath)ru   r   r   r   elemsr9   r   r:   check_html_lang  s"   


r   	temp_texttemp_commentsc                 C   sF   t du rt| t|krt| nt|\}}|S td d}|S )zARun external component (if installed) for language identificationTz3Language detector not installed, skipping detectionN)LANGID_FLAGrI   	py3langidclassifyr/   r0   )r   r   resultri   r9   r9   r:   language_classifier  s   

r   docmetac                 C   sJ   |dur!t | ||_|jdur!|j|kr!td|j|j d|fS d|fS )zFFilter text based on language detection and store relevant informationNzwrong language: %s %sTF)r   languager/   r0   r   )r   r   r   r   r9   r9   r:   language_filter  s   r   c                 C   s8   | j du r| jn| j }| p| ptttj| S )zFilter out unwanted textN)r   r   r   r   r   	RE_FILTERmatchre   )r   testtextr9   r9   r:   
textfilter  s   "r   c                 C   s   t | o|   S )zJDetermine if a string is only composed of spaces and/or control characters)r   r   r   r9   r9   r:   text_chars_test  s   r   )r   )FF)F)Y__doc__r,   r+   ImportErrorloggingrer8   r7   	functoolsr   	itertoolsr   typingr   r   r   r   r   r	   unicodedatar   r5   r4   r2   r1   r   r   cchardetr   rF   charset_normalizerr   
lxml.etreer   	lxml.htmlr   r   r   urllib3.responser   	getLogger__name__r/   rD   compileIra   rg   HTML_STRIP_TAGSrt   UNICODE	MULTILINEr   URL_BLACKLIST_REGEXr   r   r   r   r   
IGNORECASEr   r*   r;   r   r?   rQ   rN   rU   rX   rm   rw   r   r   r   r   r   r   r   r   r   r   intr   r   r   r   r   r   r   r9   r9   r9   r:   <module>   s    




%	
-" 	&