o
    Dhe(                     @   s0  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZmZmZ ddlmZmZ ejjd	krQeejd
rQejjdd ejjd	krdeejd
rdejjdd de	de	fddZde	de	fddZde	de	fddZdddZde	ddfddZ e!dkre  dS dS )z.
Implementing a basic command-line interface.
    N)version)python_version)Any   )	cli_crawlercli_discoveryexaminefile_processing_pipelineload_blacklistload_input_dictprobe_homepageurl_processing_pipelinewrite_result)PARALLEL_CORESSUPPORTED_FMT_CLIzUTF-8reconfigurezutf-8)encodingparserreturnc           	      C   s  |  dd}| }|  dd}|  dd}| }|  dd}|  d	d
}| }|jdddtd |jddtd |jdddtd |jddttd |jdddtd |jdddd |jddd td |jd!d"td |jd#d$dd |jd%d&d'd(d)d* |jd+d,d'd(d)d* |jd-d.d'd(d)d* |jd/d0d'd(d)d* |jd1d2d'd(d)d* |jd3d4dd |jd5d6d7td8 |jd9d:d;dd |jd<d=dd |jd>d?dd |jd@dAdd |jdBdCdDd |jdEdFdDd |jdGdHdd |jdIdJdd |jdKdLtd |jdMdNdd |jdOdPtd |jdQdRdd |jdSdTdd |jdUdVtdWdX |jdYdZdd |jd[d\dd |jd]d^dd |jd_d`dd |jdadbdd |jdcdddd |jdedfdd | jdgdhdidjdkdl | jdmdndodptdq drt  ds | S )tz,Add argument groups and arguments to parser.Inputz%URLs, files or directories to processOutputz+Determines if and how files will be written
NavigationzLink discovery and web crawling
Extractionz-Customization of text and metadata processingFormatzSelection of the output formatz-iz--input-filez'name of input file for batch processing)helptypez--input-dirz5read files from a specified directory (relative path)z-uz--URLzcustom URL downloadz
--parallelzAspecify a number of cores/threads for downloads and/or processing)r   r   defaultz-bz--blacklistz:file containing unwanted URLs to discard during processingz--listz/display a list of URLs without downloading them
store_true)r   actionz-oz--output-dirz6write results in a specified directory (relative path)z--backup-dirz9preserve a copy of downloaded files in a backup directoryz--keep-dirsz-keep input directory structure and file namesz--feedz.look for feeds and/or pass a feed URL as input?TF)r   nargsconstr   z	--sitemapzBlook for sitemaps for the given website and/or enter a sitemap URLz--crawlzJcrawl a fixed number of pages within a website starting from the given URLz	--explorez=explore the given websites (combination of sitemap and crawl)z--probez?probe for extractable content (works best with target language)z
--archivedz=try to fetch URLs from the Internet Archive if downloads failz--url-filterzLonly process/output URLs containing these patterns (space-separated strings)+)r   r    r   z-fz--fastz!fast (without fallback detection)z--formattingz,include text formatting (bold, italic, etc.)z--linksz5include links along with their targets (experimental)z--imagesz.include image sources in output (experimental)z--no-commentszdon't output any commentsstore_falsez--no-tableszdon't output any table elementsz--only-with-metadataz4only output those documents with title, URL and datez--with-metadataz&extract and add metadata to the outputz--target-languagez*select a target language (ISO 639-1 codes)z--deduplicatez+filter out duplicate documents and sectionsz--config-filezAoverride standard extraction parameters with a custom config filez--precisionz;favor extraction precision (less noise, possibly less text)z--recallz8favor extraction recall (more text, possibly more noise)z--output-formatzdetermine output formattxt)r   choicesr   z--csvzshorthand for CSV outputz--htmlzshorthand for HTML outputz--jsonzshorthand for JSON outputz
--markdownzshorthand for MD outputz--xmlzshorthand for XML outputz--xmlteizshorthand for XML TEI outputz--validate-teizvalidate XML TEI outputz-vz	--verbosecountr   z&increase logging verbosity (-v or -vv))r   r   r   z	--versionz!show version information and exitr   zTrafilatura trafilaturaz
 - Python )r   r   r   )	add_argument_groupadd_mutually_exclusive_groupadd_argumentstrintr   r   r   r   )	r   group1	group1_exgroup2group3	group3_exgroup4group5	group5_ex r5   P/home/air/segue/gemini/back/venv/lib/python3.10/site-packages/trafilatura/cli.pyadd_args   sN  r7   argsc                 C   s    t jdd}t|}t| S )z(Define parser for command-line argumentsz&Command-line interface for Trafilatura)description)argparseArgumentParserr7   map_args
parse_args)r8   r   r5   r5   r6   r=      s   r=   c                 C   s$   dD ]}t | |r|| _ | S q| S )z2Map existing options to format and output choices.)csvhtmljsonmarkdownxmlxmltei)getattroutput_format)r8   otyper5   r5   r6   r<      s   
r<   c                  C   s   t tjdd } t|  dS )z  Run as a command-line utility. r   N)r=   sysargvprocess_args)r8   r5   r5   r6   main   s   rJ   c                 C   s   d}| j dkrtjtjtjd n| j dkrtjtjtjd | jr(t| j| _| j	s1| j
s1| jr6t| }n9| jr>t|  n1| jrFt|  n)| jrNt|  n!| jsT| jr^t| }t| |}nttjj | | jd}t||  |dkrzt| dS dS )z8Perform the actual processing according to the argumentsr   r   )streamlevel   )urlN)verboseloggingbasicConfigrG   stdoutWARNINGDEBUG	blacklistr
   explorefeedsitemapr   crawlr   prober   	input_dirr	   
input_fileURLr   r   r   stdinbufferreadr   exit)r8   	exit_code	url_storeresultr5   r5   r6   rI      s.   






rI   __main__)r   N)"__doc__r:   rP   rG   importlib.metadatar   platformr   typingr   	cli_utilsr   r   r   r	   r
   r   r   r   r   settingsr   r   rR   r   hasattrr   stderrr7   r=   r<   rJ   rI   __name__r5   r5   r5   r6   <module>   s,    , 

-
