o
    DhD                     @   st  d Z zddlZdZW n ey   dZY nw ddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC eDeEZFeGd ejHejI ZJeKdZLeKdZMeKdZNg dZOddddd ZPd!ed"eeQ fd#d$ZRd%eQd"eeQ fd&d'ZSd!ed"e"fd(d)ZTd*eQd"eUfd+d,ZVd-eQd.eWd"eQfd/d0ZXd1eQd2eQd"e eQeQf fd3d4ZYd5eQd"eQfd6d7ZZ	8	dfd!ed9eQd5eQd:eWd;eeQ d"e eQeQf fd<d=Z[dgd>eQd!ed:eWd"eQfd?d@Z\	A	8	dhdBeeQ d!ed9eQd:eWd;eeQ d"dfdCdDZ]dEeQd"eeQddf fdFdGZ^	dfd%eQd!ed:eWdHee8 d"df
dIdJZ_d>eQd!ed:eWdHee8 d"eWf
dKdLZ`dMe"d!ed:eWdHe8d"e eeQ eWf f
dNdOZad!ed"eWfdPdQZbdMe"dReeQ d!ed"e"fdSdTZc	U		did!edVeWdMee" dHee8 d"df
dWdXZdd!ed"dfdYdZZed[eeQ d\eWd"eWfd]d^Zfd!edMe"d"eWfd_d`Zgd!ed"dfdadbZh		djd>ee d!edceeQ dHee8 d"eeQ f
dddeZidS )kz1
Functions dedicated to command-line processing.
    NTF)urlsafe_b64encode)ProcessPoolExecutorThreadPoolExecutoras_completed)datetime)partial)makedirspathstatwalk)RLock)Any	GeneratorOptionalListSetTuple)UrlStoreextract_domainget_base_url)spider   )html2txt)extract)generate_bow_hash)Responseadd_to_compressed_dictbuffered_downloadsbuffered_response_downloadsload_download_buffer)find_feed_urls)reset_caches)	ExtractorFILENAME_LENMAX_FILES_PER_DIRECTORYargs_to_extractor)sitemap_search)LANGID_FLAGURL_BLACKLIST_REGEXis_acceptable_lengthlanguage_classifiermake_chunksiY  z[^/]+$z\.[a-z]{2,5}$z<[^<]+?>)URLcrawlexploreprobefeedsitemapz.csvz.jsonz.xml)csvjsonxmlxmlteiargsreturnc                 C   s   g }| j r8z$t| j ddd}|dd |D  W d   n1 s#w   Y  W n  ty7   td Y nw tD ]}t| |rIt| |g} nq:|sQt	d t
t|S )	zGRead list of URLs to process or derive one from command-line arguments.rutf-8modeencodingc                 s       | ]}|  V  qd S N)strip.0line rC   V/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/cli_utils.py	<genexpr>V       z"load_input_urls.<locals>.<genexpr>Nz+ERROR: system, file type or buffer encodingzNo input provided)
input_fileopenextendUnicodeDecodeErrorsysexitINPUT_URLS_ARGSgetattrLOGGERwarninglistdictfromkeys)r6   
input_urls	inputfileargrC   rC   rD   load_input_urlsN   s$   

rW   filenamec                 C   sB   t | ddd}dd |D }W d   |S 1 sw   Y  |S )zRead list of unwanted URLs.r8   r9   )r<   c                 S   s   h | ]
}t d | qS ) )r(   subr?   r@   rC   rC   rD   	<setcomp>j       z!load_blacklist.<locals>.<setcomp>N)rH   )rX   inputfh	blacklistrC   rC   rD   load_blacklistf   s   
r_   c                 C   s*   t | }t|| j| jo| j | j| jdS )zGRead input list of URLs to process and build a domain-aware dictionary.)r^   compression
url_filterverbose)rW   r   r^   r1   rQ   ra   rb   )r6   	inputlistrC   rC   rD   load_input_dictn   s   rd   	directoryc                 C   sT   t | r
t | s(z	t| dd W dS  ty'   tjd|  d  Y dS w dS )z;Check if the output directory is within reach and writable.T)exist_okz0ERROR: Destination directory cannot be created: 
F)r	   existsisdirr   OSErrorrK   stderrwrite)re   rC   rC   rD   check_outputdir_status{   s   

	rm   dirnamecc                 C   s,   |dkrt t|t d nd}t| |S )z7Return a destination directory based on a file counter.r   r   rY   )strintr$   r	   join)rn   ro   c_dirrC   rC   rD   determine_counter_dir   s    rt   destdir	extensionc                 C   sX   d}|du st |r(ddd ttD }t | || }|du st |s||fS )zCFind a writable path and return it along with its random file name.NrY   c                 s   s    | ]}t tV  qd S r>   )randomchoice
CHAR_CLASS)rA   _rC   rC   rD   rE      s    z$get_writable_path.<locals>.<genexpr>)r	   rh   rr   ranger#   )ru   rv   output_pathrX   rC   rC   rD   get_writable_path   s   r}   contentc                 C   s   t ttd| d S )zaCreate a filename-safe string by hashing the given content
    after deleting potential XML tags.rY      )r   r   	CLEAN_XMLrZ   decode)r~   rC   rC   rD   generate_hash_filename   s   r   orig_filenamecounternew_filenamec           
      C   sl   t | jd}| jrtd|}t| j|}t	d|}nt
| j|}|p)t|}t||| }	|	|fS )zPPick a directory based on selected options and a file name based on output type.z.txtrY   )EXTENSION_MAPPINGgetoutput_format	keep_dirs	STRIP_DIRrZ   r	   rr   
output_dirSTRIP_EXTENSIONrt   r   )
r6   r   r~   r   r   rv   original_dirdestination_dirrX   r|   rC   rC   rD   determine_output_path   s   	r   
htmlstringc                 C   sl   t |j|}t|d\}}t|du r4tr4t|d}|| d W d   |S 1 s/w   Y  |S )z-Write a copy of raw HTML in backup directory.z.html.gzTwbr9   N)	rt   
backup_dirr}   rm   HAS_GZIPgziprH   rl   encode)r   r6   r   destination_directoryr|   rX   
outputfilerC   rC   rD   archive_html   s   
r   rY   resultc                 C   s   | du rdS |j du rtj| d  dS t||| ||\}}t|du rDt|ddd}||  W d   dS 1 s=w   Y  dS dS )z-Deal with result (write to STDOUT or to file)Nrg   Twr9   r:   )r   rK   stdoutrl   r   rm   rH   )r   r6   r   r   r   destination_pathr   r   rC   rC   rD   write_result   s   

"r   inputdirc                 c   s2    t | D ]\}}}|D ]	}t||V  qqdS )z2Walk the directory tree and output all file names.N)r   r	   rr   )r   rootrz   
inputfilesfnamerC   rC   rD   generate_filelist   s   r   optionsc           	      C   s   |st |}| |_t| d}| }W d   n1 sw   Y  t| }t|j|j}t	|
d|jd< t|||d}t||| |dd dS )z1Aggregated functions to process a file in a list.rbNz%Y-%m-%dmax_dater   )r   )r%   sourcerH   readr
   minst_ctimest_mtimer   fromtimestampstrftimedate_paramsexaminer   )	rX   r6   r   r   inputfr   	file_statref_timestampr   rC   rC   rD   file_processing   s   


r   c                 C   sN   |j r	t| ||nd}t| ||d}t|||||d |dkr%|r%|d7 }|S )zVExtract text and metadata from a download webpage and eventually write out the result.rY   r   )r   r   r   r   r   )r   r   r   r   )r   r6   r   r   fileslugr   rC   rC   rD   process_result   s   
r   	url_storec           	      C   s   g }|j dd}| js@t| |\}} t||j|dD ]!\}}|r1t|tr1||_t	||||}qt
d| || q| jr||fS )z?Implement a download queue consumer, single- or multi-threaded.DEFAULT
SLEEP_TIMEr   zNo result for URL: %s)configgetfloatdoner   r   parallel
isinstancerp   urlr   rO   rP   append)	r   r6   r   r   errors
sleep_time
bufferlistr   r   rC   rC   rD   download_queue_processing  s   r   c           	   
      sD  t | }| }| jr|  t| }t| jrtnt| j	|j
dd|j
dddt }t| jdO  fdd|D }t|D ]8}| dur{||  | jr{t| | jkr{| |  |  t  W d   n1 svw   Y  qCW d   n1 sw   Y  t| |}| jrt||| }t| ||d	 |S )
z/Group CLI functions dedicated to URL discovery.r   EXTERNAL_URLSr   )target_langexternalr   max_workersc                 3   s    | ]	}  |V  qd S r>   )submit)rA   r   executorfuncrC   rD   rE   5  s    z cli_discovery.<locals>.<genexpr>N)r   r   )rd   	dump_urlsrQ   resetr%   r   r0   r    r&   target_languager   
getbooleanr   r   r   r   r   r   add_urlslenget_known_domainsprint_unvisited_urlsr!   url_processing_pipeliner.   build_exploration_dictcli_crawler)	r6   r   rT   r   lockfuturesfuture	exit_codecontrol_dictrC   r   rD   cli_discovery#  s>   
r   rT   c                    sL   dd |D }|dd |   D    fdd|D }t||j|j|jdS )zMFind domains for which nothing has been found and add info to the crawl dict.c                 S      h | ]}t |qS rC   r   rA   urC   rC   rD   r[   R      z)build_exploration_dict.<locals>.<setcomp>c                 S   r   rC   r   r   rC   rC   rD   r[   S  s    c                    s   g | ]
}t | v r|qS rC   r   r   still_to_crawlrC   rD   
<listcomp>V  r\   z*build_exploration_dict.<locals>.<listcomp>)r^   ra   rb   )r   r   r^   ra   rb   )r   rT   r6   input_domainsnew_input_urlsrC   r   rD   r   N  s   r      nc                    s"  |pt | }|jdd}i }|du rtjt|  n|t_tj D ]}tjj| j	rAtjj
|dd}|rAtj|| jd||< q$tjjsttj|\}t_t|| j|dD ]\}	}
|
rlt|
trlt|
|t|	  qWt fdd	tj D r|ntjjrFtd
dd	 tj D  dS )z~Start a focused crawler which downloads a fixed number of URLs within a website
    and prints the links found in the process.r   r   NF)
as_visited)langr   c                 3   s    | ]}| kV  qd S r>   rC   rA   ro   r   rC   rD   rE     rF   zcli_crawler.<locals>.<genexpr>rg   c                 s   s    | ]}|V  qd S r>   rC   r   rC   rC   rD   rE     s    )r%   r   r   r   	URL_STOREr   rW   r   urldicttuplesget_url
init_crawlr   r   r   r   r   r   r   process_responser   anyget_all_countsprintrr   r   )r6   r   r   r   r   
param_dicthostname	startpager   r   r   rC   r   rD   r   _  s8   
	
"r   c                 C   s   t | }t| }t|| j|dD ]1\}}|durAt|}|rAt||jkrAtdd |D rAtr;| j	r;t
|d| j	krAt|dd qdS )zBProbe websites for extractable content and print the fitting ones.r   Nc                 s   r=   r>   )isalphar   rC   rC   rD   rE     rF   z!probe_homepage.<locals>.<genexpr>rY   T)flush)rW   r%   r   r   r   r   min_extracted_sizer   r'   r   r*   r   )r6   rT   r   r   r   rC   rC   rD   probe_homepage  s(   r   r   totalc                 C   s0   |dkr
t | | nd}|dkrdS | rdS dS )zvCompute exit code based on the number of errors:
    0 if there are no errors, 126 if there are too many, 1 otherwise.r   gGz?~   r   )r   )r   r   ratiorC   rC   rD   _define_exit_code  s   r  c                 C   s   | j r	|  dS t| }| }|tkrdnd}t|| ||\}}tdt|| | j	du rat
 }|dd |D  t|ddkrat|| ||\}}td	t|t| t|| S t||S )
zKAggregated functions to show a list and download and process an input list.Fr   r   z%s / %s URLs could not be foundTc                 S   s   g | ]}d | qS )zhttps://web.archive.org/web/20/rC   )rA   erC   rC   rD   r     r   z+url_processing_pipeline.<locals>.<listcomp>zhttps://web.archive.orgz-%s archived URLs out of %s could not be found)rQ   r   r%   total_url_numberr$   r   rO   debugr   archivedr   r   find_known_urlsr  )r6   r   r   	url_countr   r   archived_errorsrz   rC   rC   rD   r     s,   

r   c                 C   s   d}t | }|jdd}t| jd;}tt| jtD ])}|dk r*t	|tkr*d}t
t| ||d}|j||d|d |dkrE|t	|7 }qW d	   d	S 1 sQw   Y  d	S )
zGDefine batches for parallel file processing and perform the extraction.r   r   EXTRACTION_TIMEOUTr   r   )r6   r   r   
   )	chunksizetimeoutN)r%   r   getintr   r   r+   r   	input_dirr$   r   r   r   map)r6   filecounterr   r  r   	filebatchworkerrC   rC   rD   file_processing_pipeline  s$   
"r  r   c              
   C   s   d}|s	t ||}| du rtjd |S tt| |s$tjd |S z	t| |d}W |S  tyQ } ztjdt| dt	
  d W Y d}~|S d}~ww )z;Generic safeguards and triggers around extraction function.NzERROR: empty document
zERROR: file size
r   zERROR: rg   )r%   rK   rk   rl   r)   r   r   	Exceptionrp   	traceback
format_exc)r   r6   r   r   r   errrC   rC   rD   r     s"   

,r   )r   N)r   )rY   r   N)r   NN)NN)j__doc__r   r   ImportErrorloggingrw   restringrK   r  base64r   concurrent.futuresr   r   r   r   	functoolsr   osr   r	   r
   r   	threadingr   typingr   r   r   r   r   r   courlanr   r   r   trafilaturar   baseliner   corer   deduplicationr   	downloadsr   r   r   r   r   feedsr    metar!   settingsr"   r#   r$   r%   sitemapsr&   utilsr'   r(   r)   r*   r+   	getLogger__name__rO   seedascii_lettersdigitsry   compiler   r   r   rM   r   rp   rW   r_   rd   boolrm   rq   rt   r}   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r  r   rC   rC   rC   rD   <module>   s<    
	











+

0 