
    (#he(                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZmZmZ ddlmZmZ ej.                  j0                  d	k7  r/ eej.                  d
      rej.                  j5                  d       ej6                  j0                  d	k7  r/ eej6                  d
      rej6                  j5                  d       de	de	fdZde	de	fdZde	de	fdZddZde	ddfdZ e!dk(  r e        yy)z.
Implementing a basic command-line interface.
    N)version)python_version)Any   )	cli_crawlercli_discoveryexaminefile_processing_pipelineload_blacklistload_input_dictprobe_homepageurl_processing_pipelinewrite_result)PARALLEL_CORESSUPPORTED_FMT_CLIzUTF-8reconfigurezutf-8)encodingparserreturnc           
         | j                  dd      }|j                         }| j                  dd      }| j                  dd      }|j                         }| j                  dd      }| j                  d	d
      }|j                         }|j                  dddt               |j                  ddt               |j                  dddt               |j                  ddt        t
               |j                  dddt               |j                  ddd       |j                  ddd t               |j                  d!d"t               |j                  d#d$d       |j                  d%d&d'd(d)*       |j                  d+d,d'd(d)*       |j                  d-d.d'd(d)*       |j                  d/d0d'd(d)*       |j                  d1d2d'd(d)*       |j                  d3d4d       |j                  d5d6d7t        8       |j                  d9d:d;d       |j                  d<d=d       |j                  d>d?d       |j                  d@dAd       |j                  dBdCdD       |j                  dEdFdD       |j                  dGdHd       |j                  dIdJd       |j                  dKdLt               |j                  dMdNd       |j                  dOdPt               |j                  dQdRd       |j                  dSdTd       |j                  dUdVt        dWX       |j                  dYdZd       |j                  d[d\d       |j                  d]d^d       |j                  d_d`d       |j                  dadbd       |j                  dcddd       |j                  dedfd       | j                  dgdhdidjdkl       | j                  dmdndodpt        dq       drt                s       | S )tz,Add argument groups and arguments to parser.Inputz%URLs, files or directories to processOutputz+Determines if and how files will be written
NavigationzLink discovery and web crawling
Extractionz-Customization of text and metadata processingFormatzSelection of the output formatz-iz--input-filez'name of input file for batch processing)helptypez--input-dirz5read files from a specified directory (relative path)z-uz--URLzcustom URL downloadz
--parallelzAspecify a number of cores/threads for downloads and/or processing)r   r   defaultz-bz--blacklistz:file containing unwanted URLs to discard during processingz--listz/display a list of URLs without downloading them
store_true)r   actionz-oz--output-dirz6write results in a specified directory (relative path)z--backup-dirz9preserve a copy of downloaded files in a backup directoryz--keep-dirsz-keep input directory structure and file namesz--feedz.look for feeds and/or pass a feed URL as input?TF)r   nargsconstr   z	--sitemapzBlook for sitemaps for the given website and/or enter a sitemap URLz--crawlzJcrawl a fixed number of pages within a website starting from the given URLz	--explorez=explore the given websites (combination of sitemap and crawl)z--probez?probe for extractable content (works best with target language)z
--archivedz=try to fetch URLs from the Internet Archive if downloads failz--url-filterzLonly process/output URLs containing these patterns (space-separated strings)+)r   r"   r   z-fz--fastz!fast (without fallback detection)z--formattingz,include text formatting (bold, italic, etc.)z--linksz5include links along with their targets (experimental)z--imagesz.include image sources in output (experimental)z--no-commentszdon't output any commentsstore_falsez--no-tableszdon't output any table elementsz--only-with-metadataz4only output those documents with title, URL and datez--with-metadataz&extract and add metadata to the outputz--target-languagez*select a target language (ISO 639-1 codes)z--deduplicatez+filter out duplicate documents and sectionsz--config-filezAoverride standard extraction parameters with a custom config filez--precisionz;favor extraction precision (less noise, possibly less text)z--recallz8favor extraction recall (more text, possibly more noise)z--output-formatzdetermine output formattxt)r   choicesr   z--csvzshorthand for CSV outputz--htmlzshorthand for HTML outputz--jsonzshorthand for JSON outputz
--markdownzshorthand for MD outputz--xmlzshorthand for XML outputz--xmlteizshorthand for XML TEI outputz--validate-teizvalidate XML TEI outputz-vz	--verbosecountr   z&increase logging verbosity (-v or -vv))r    r   r   z	--versionz!show version information and exitr   zTrafilatura trafilaturaz
 - Python )r   r    r   )	add_argument_groupadd_mutually_exclusive_groupadd_argumentstrintr   r   r   r   )	r   group1	group1_exgroup2group3	group3_exgroup4group5	group5_exs	            L/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/cli.pyadd_argsr8      sE    &&w0WXF335I&&x1^_F&&|5VWF335I&&|5deF&&x1QRF335I4F   " =T   " 42   " ` .  : mY   " N+  - nU   " X   " L+  - 8M!u  > ;a!u  > 9i!u  > ;\!u  > 9^!u  > \+  - k!  - h@+  - K+  - 	T+  - 
M+  - 8,  . >,  . .S+  - )E+  - +I   " J+  - `   " Z+  - 
W+  -
 ,6 1 %  ' 77+  - 88+  - 88+  - <6+  - 77+  - :;+  - (6+  - k'1E   0w}56jAQ@RS	   M    argsc                 v    t        j                  d      }t        |      }t        |j	                               S )z(Define parser for command-line argumentsz&Command-line interface for Trafilatura)description)argparseArgumentParserr8   map_args
parse_args)r:   r   s     r7   r@   r@      s1    $$1YZFfFF%%'((r9   c                 @    dD ]  }t        | |      s|| _         | S  | S )z2Map existing options to format and output choices.)csvhtmljsonmarkdownxmlxmltei)getattroutput_format)r:   otypes     r7   r?   r?      s5     F 4!&DK	 Kr9   c                  R    t        t        j                  dd       } t        |        y)z  Run as a command-line utility. r   N)r@   sysargvprocess_args)r:   s    r7   mainrO      s    chhqrl#Dr9   c                 x   d}| j                   dk(  r4t        j                  t        j                  t        j
                         nB| j                   dk\  r3t        j                  t        j                  t        j                         | j                  rt        | j                        | _        | j                  s| j                  s| j                  rt        |       }n| j                  rt        |        n| j                  rt!        |        n| j"                  rt%        |        nz| j&                  s| j(                  rt+        |       }t-        | |      }nJt/        t        j0                  j2                  j5                         | | j(                        }t7        ||        |dk7  rt        j8                  |       yy)z8Perform the actual processing according to the argumentsr   r   )streamlevel   )urlN)verboseloggingbasicConfigrL   stdoutWARNINGDEBUG	blacklistr   explorefeedsitemapr   crawlr   prober   	input_dirr
   
input_fileURLr   r   r	   stdinbufferreadr   exit)r:   	exit_code	url_storeresults       r7   rN   rN      s(   I||q3::W__E		3::W]]C~~'7
 ||tyyDLL!$'	 
D 
t 
 & 
DHH#D)	+D)<	 ))..0$DHHEVT" A~ r9   __main__)r   N)"__doc__r=   rV   rL   importlib.metadatar   platformr   typingr   	cli_utilsr   r   r	   r
   r   r   r   r   r   settingsr   r   rX   r   hasattrr   stderrr8   r@   r?   rO   rN   __name__ r9   r7   <module>rv      s     
 & # ? ? ? 8 ::'!gcjj-&HJJG,::'!gcjj-&HJJG,PS PS Pf)S )S )3 3 *s *t *Z zF r9   