
    "#h81                         d dl mZ d dl mZmZmZ d dlZd dlZd dlZd dlZddl	m
Z
mZmZmZ ddl ddlmZmZ d Zej&                  d	fd
Zej&                  fdZej&                  fdZd Zedk(  r e        yy)    )absolute_import)divisionprint_functionunicode_literalsN   )escapePY3URLErrorurllib)*)get_stoplistget_stoplistsc                      dt         j                  j                  t         j                  j                  t        j                  d               t
        t        t        t        t        t        t        t        d	z  S )Na  Usage: %(progname)s -s STOPLIST [OPTIONS] [HTML_FILE]
Convert HTML to plain text and remove boilerplate.

  -o OUTPUT_FILE   if not specified, output is written to stdout
  --encoding=...   default character encoding to be used if not specified
                   in the HTML meta tags (default: %(default_encoding)s)
  --enc-force      force specified encoding, ignore HTML meta tags
  --enc-errors=... errors handling for character encoding conversion:
                     strict: fail on error
                     ignore: ignore characters which can't be converted
                     replace: replace characters which can't be converted
                              with U+FFFD unicode replacement characters
                   (default: %(default_enc_errors)s)
  --format=...     output format; possible values:
                     default: one paragraph per line, each preceded with
                              <p> or <h> (headings)
                     boilerplate: same as default, except for boilerplate
                                  paragraphs are included, too, preceded
                                  with <b>
                     detailed: one paragraph per line, each preceded with
                               <p> tag containing detailed information
                               about classification as attributes
                     krdwrd: KrdWrd compatible format
  --no-headings    disable special handling of headings
  --list-stoplists print a list of inbuilt stoplists and exit
  -V, --version    print version information and exit
  -h, --help       display this help and exit

If no HTML_FILE specified, input is read from stdin.

STOPLIST must be one of the following:
  - one of the inbuilt stoplists; see:
      %(progname)s --list-stoplists
  - path to a file with the most frequent words for given language,
    one per line, in UTF-8 encoding
  - None - this activates a language-independent mode

Advanced options:
  --length-low=INT (default %(length_low)i)
  --length-high=INT (default %(length_high)i)
  --stopwords-low=FLOAT (default %(stopwords_low)f)
  --stopwords-high=FLOAT (default %(stopwords_high)f)
  --max-link-density=FLOAT (default %(max_link_density)f)
  --max-heading-distance=INT (default %(max_heading_distance)i)
r   )	progname
length_lowlength_highstopwords_lowstopwords_highmax_link_densitymax_heading_distancedefault_encodingdefault_enc_errors)ospathbasenamesysargvLENGTH_LOW_DEFAULTLENGTH_HIGH_DEFAULTSTOPWORDS_LOW_DEFAULTSTOPWORDS_HIGH_DEFAULTMAX_LINK_DENSITY_DEFAULTMAX_HEADING_DISTANCE_DEFAULTDEFAULT_ENCODINGDEFAULT_ENC_ERRORS     M/var/www/html/sandstorm/venv/lib/python3.12/site-packages/justext/__main__.pyusager)      s[    ,Z   !1!1#((1+!>?$&*,08(,
Y6 6r'   Tc           
          | D ]P  }|j                   dk(  r|j                  rd}nd}n|r'd}t        d|dt        |j                  d      |	       R y
)z
    Outputs the paragraphs as:
    <tag> text of the first paragraph
    <tag> text of the second paragraph
    ...
    where <tag> is <p>, <h> or <b> which indicates
    standard paragraph, heading or boilerplate respecitvely.
    goodhpb<z> FquotefileN)
class_typeheadingprintr   text)
paragraphsfpno_boilerplate	paragraphtags        r(   output_defaultr=   J   s[       O	6)  C3y~~U CD2NOr'   c           
          | D ]_  }d|j                   |j                  t        |j                        |j                  t        |j                  d      fz  }t        ||       a y)z
    Same as output_default, but only <p> tags are used and the following
    attributes are added: class, cfclass and heading.
    z6<p class="%s" cfclass="%s" heading="%i" xpath="%s"> %sFr0   r2   N)r4   cf_classintr5   xpathr   r7   r6   )r8   r9   r;   outputs       r(   output_detailedrC   a   sd    
   	I  	!!"OO9>>/M
 
 	f2r'   c                     | D ]U  }|j                   dv r|j                  rd}nd}nd}|j                  D ]"  }t        d||j	                         fz  |       $ W y)a  
    Outputs the paragraphs in a KrdWrd compatible format:
    class<TAB>first text node
    class<TAB>second text node
    ...
    where class is 1, 2 or 3 which means
    boilerplate, undecided or good respectively. Headings are output as
    undecided.
    )r+   neargood      r   z%i	%sr2   N)r4   r5   
text_nodesr6   strip)r8   r9   r;   cls	text_nodes        r(   output_krdwrdrL   q   sn       
@	#77  C"-- 	@I(c9??#455B?	@
@r'   c                     dd l } ddlm} 	 | j                  t        j                  dd  dg d      \  }}t        j                  d      d   }t        j                  }t        r! |t        j                  j                        }n |t        j                        }d }d	}	d
}
t         }t"        }t$        }t&        }t(        }t*        }d }t,        }d
}t.        }	 D ]  \  }}|dv r(t        t                      t        j                  d       |dv rUt        t0        j2                  j5                  t        j                  d         d|d       t        j                  d       |dk(  rAt        dj7                  t9        t;                                  t        j                  d       |dk(  r	 t        j<                  |dd      }|dk(  r|jC                         dk(  rtE               }t0        j2                  jG                  |      rQ	 t        j<                  |dd      }tE        |D cg c]  }|jI                          c}      }|jK                          |t;               v rtO        |      }tQ        jR                  d|      r1tA        d|ddj7                  t9        t;                                 tA        d|z        |dk(  r	 |}djU                  |       |d!k(  rd"}|d#k(  r2|jC                         d$v r|jC                         }?tA        d%|z        |d&k(  r|d'v r|}	ZtA        d(|z        |d)k(  rd"}
q|d*k(  r	 tY        |      }|d.k(  r	 tY        |      }|d/k(  r	 t]        |      }|d1k(  r	 t]        |      }|d2k(  r	 t]        |      }|d3k(  s	 tY        |      } |r|}|tA        d4      |sd}d}rB	 tQ        jR                  d5|d         rt_        j`                  |d         }nt=        |d   d      }|je                         }|t        j                  ur|jK                          t        |||||||||
|||      }|	d	k(  rtg        ||       y |	d6k(  rtg        ||d
7       y |	d8k(  rti        ||       y |	d9k(  rtk        ||       y tm        d:|	z        # | j
                  $ r^}t        |t        j                         t        t               t        j                         t        j                  d       Y d }~d }~ww xY w# t>        $ r}tA        d|d|      d }~ww xY wc c}w # t>        $ r}tA        d|d|      d }~wtL        $ r}tA        d|z        d }~ww xY w# tV        $ r tA        d |z        w xY w# tZ        $ r tA        d+|d,|d-      w xY w# tZ        $ r tA        d+|d,|d-      w xY w# tZ        $ r tA        d+|d,|d0      w xY w# tZ        $ r tA        d+|d,|d0      w xY w# tZ        $ r tA        d+|d,|d0      w xY w# tZ        $ r tA        d+|d,|d-      w xY w# t>        tb        f$ r}tA        d|d   d|      d }~ww xY w# tn        $ rm}t        t0        j2                  j5                  t        j                  d         d;|t        j                         t        j                  d       Y d }~y d }~ww xY w)<Nr   )__version__r   zo:s:hV)z	encoding=z	enc-forcezenc-errors=zformat=zno-headingshelpversionzlength-low=zlength-high=zstopwords-low=zstopwords-high=zmax-link-density=zmax-heading-distance=zlist-stoplistsr2   utf8defaultF)z-hz--help)z-Vz	--versionz: jusText vz<

Copyright (c) 2011 Jan Pomikalek <jan.pomikalek@gmail.com>z--list-stoplists
z-owzCan't open z for writing: z-snonerz for reading: zLUnicode decoding error when reading the stoplist (probably not in UTF-8): %sz^\w*$zUnknown stoplist: z
Available stoplists:
zFile not found: %sz
--encoding zUknown character encoding: %sz--enc-forceTz--enc-errors)strictignorereplacezInvalid --enc-errors value: %sz--format)rS   boilerplatedetailedkrdwrdzUknown output format: %sz--no-headingsz--length-lowzInvalid value for z: 'z'. Integer expected.z--length-highz--stopwords-lowz'. Float expected.z--stopwords-highz--max-link-densityz--max-heading-distancezNo stoplist specified.z	[^:/]+://r\   )r:   r]   r^   zUnknown format: %sz: )8getoptjustextrN   r   r   GetoptErrorr6   stderrr)   exitcodecslookupstdinr	   stdoutbufferr   r   r    r!   r"   r#   r$   r%   r   r   r   joinsortedr   openIOErrorJustextInvalidOptionslowersetisfilerI   closeUnicodeDecodeErrorr   rematchencodeLookupErrorr@   
ValueErrorfloatr   urlopenr
   readr=   rC   rL   AssertionErrorJustextError)r_   VERSIONoptsargsestream_writerfp_infp_outstoplistformatno_headingsr   r   r   r   r   r   encodingr   force_default_encoding
enc_errorsoafp_stoplistl	html_textr8   s                              r(   mainr      s   .	]]388AB< <7 8
d MM&)"-MIIE
szz001szz*HFK#J%K)M+N/7H'"#JQ d	RDAq$$eg''GG$$SXXa[17< =((dii} 789dB#[[C8F d779&"uHww~~a(
P*0++af*EK'*{+K!AGGI+K'LH'--/ mo-#/?88Ha0 #8$%tyy1H'I!K#L L
 #88Lq8P"QQl"U'($II./ m#)-&n$779 ??!"J/0PST0TUUjHHF/0JQ0NOOo%"n$R!$QJ o%R"%a&K ''P$)!HM ((P%*1XN **P',Qx$ ..R+.q6(Cd	RL "'H'(@AANMD88L$q'2"NN473E a#.E
 JJL			!KKMY*k>+;=Q#3ZA
 Y:v.}$:veDz!J/x*f- !!5!>??M  acjj!egCJJ'P  B/;<a@B BB ,L& J"7CDa H#J J1 P"7!KMN!O#P PP( # U/0ORS0STTU& " R/KLaPR RR " R/KLaPR RR " P/IJANP PP " P/IJANP PP " P/IJANP PP " R/KLaPR RR* X& D+7;AwBD DD.  "''**388A;7;#**Ms  )R8 CZ /T(AZ  U+U

UA5Z V"A+Z V&Z !W,Z 4W$?Z XZ X"%Z /Y:Z AY  AZ 2Z Z Z *Z 8T%AT  T%(	U1UUZ 
U	VU))V5VVZ V##Z &WZ W!!Z $X  Z XZ "X>>Z YZ  Z/ZZZ 	\A#[<<\__main__)
__future__r   r   r   r   rd   r   rs   r   _compatr   r	   r
   r   coreutilsr   r   r)   rg   r=   rC   rL   r   __name__r&   r'   r(   <module>r      sw    ' A A  	 	 
 2 2  .7t #&**T O. $'::   "% @.tn zF r'   