001
002/*
003 * html2txt - Converts HTML documents to plain text
004 *
005 * Copyright (c) 2015, Arno Unkrig
006 * All rights reserved.
007 *
008 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
009 * following conditions are met:
010 *
011 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
012 *       following disclaimer.
013 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
014 *       following disclaimer in the documentation and/or other materials provided with the distribution.
015 *    3. The name of the author may not be used to endorse or promote products derived from this software without
016 *       specific prior written permission.
017 *
018 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
020 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
021 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
022 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
023 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
024 * POSSIBILITY OF SUCH DAMAGE.
025 */
026
027package de.unkrig.html2txt;
028
029import java.io.File;
030import java.io.InputStream;
031import java.io.InputStreamReader;
032import java.io.OutputStreamWriter;
033import java.io.PrintWriter;
034import java.nio.charset.Charset;
035
036import javax.xml.transform.SourceLocator;
037import javax.xml.transform.TransformerException;
038
039import org.xml.sax.SAXParseException;
040
041import de.unkrig.commons.io.IoUtil;
042import de.unkrig.html2txt.Html2Txt.HtmlException;
043
044/**
045 * A command line interface for {@link Html2Txt}.
046 */
047public final
048class Main {
049
050    private Main() {}
051
052    /**
053     * <h2>Usage:</h2>
054     *
055     * <dl>
056     *   <dt>{@code html2txt} [ <var>option</var> ] ... <var>input-file</var></dt>
057     *   <dd>
058     *     Converts the HTML document in the <var>input-file</var> to plain text, and writes it to STDOUT.
059     *   </dd>
060     *   <dt>{@code html2txt} [ <var>option</var> ] ... <var>input-file</var> <var>output-file</var></dt>
061     *   <dd>
062     *     Converts the HTML document in the <var>input-file</var> to plain text, and writes it to the
063     *     <var>output-file</var>.
064     *   </dd>
065     * </dl>
066     *
067     * <h2>Options:</h2>
068     *
069     * <dl>
070     *   <dt>{@code -help}</dt>
071     *   <dd>
072     *     Print this text and terminate.
073     *   </dd>
074     *   <dt>{@code -page-width} <var>N</var></dt>
075     *   <dd>
076     *     The maximum line length to produce. Defaults to the value of the "{@code $COLUMNS}" environment variable,
077     *     if set, otherwise to "80".
078     *   </dd>
079     *   <dt>{@code -encoding} <var>enc</var></dt>
080     *   <dd>
081     *     The charset to use when reading the input file and writing the output file.
082     *   </dd>
083     *   <dt>{@code -input-encoding} <var>enc</var></dt>
084     *   <dd>
085     *     The charset to use when reading the input file.
086     *   </dd>
087     *   <dt>{@code -output-encoding} <var>enc</var></dt>
088     *   <dd>
089     *     The charset to use when writing the output file.
090     *   </dd>
091     * </dl>
092     */
093    public static void
094    main(String[] args) throws Exception {
095
096        Html2Txt html2Txt = new Html2Txt();
097
098        int idx = 0;
099        while (idx < args.length) {
100            String arg = args[idx];
101            if (!arg.startsWith("-")) break;
102            idx++;
103            if ("-help".equals(arg)) {
104                InputStream is = Main.class.getClassLoader().getResourceAsStream("de/unkrig/html2txt/usage.txt");
105                IoUtil.copy(
106                    new InputStreamReader(is, Charset.forName("UTF-8")), // inputStream
107                    true,                                                // closeReader
108                    new OutputStreamWriter(System.out),                  // outputStream
109                    false                                                // closeWriter
110                );
111                return;
112            } else
113            if ("-page-width".equals(arg)) {
114                html2Txt.setPageWidth(Integer.parseInt(args[idx++]));
115            } else
116            if ("-encoding".equals(arg)) {
117                Charset cs = Charset.forName(args[idx++]);
118                html2Txt.setInputCharset(cs);
119                html2Txt.setOutputCharset(cs);
120            } else
121            if ("-input-encoding".equals(arg)) {
122                html2Txt.setInputCharset(Charset.forName(args[idx++]));
123            } else
124            if ("-output-encoding".equals(arg)) {
125                html2Txt.setOutputCharset(Charset.forName(args[idx++]));
126            } else
127            {
128                System.err.println("Invalid command line option \"" + arg + "\"; try \"-help\".");
129                System.exit(1);
130                return;
131            }
132        }
133
134        try {
135            switch (args.length - idx)  {
136
137            case 1:
138                {
139                    File inputFile = new File(args[idx++]);
140                    html2Txt.html2txt(inputFile, new PrintWriter(System.out));
141                }
142                break;
143
144            case 2:
145                {
146                    File inputFile  = new File(args[idx++]);
147                    File outputFile = new File(args[idx++]);
148                    html2Txt.html2txt(inputFile, outputFile);
149                }
150                break;
151
152            default:
153                System.err.println("Invalid number of command line arguments; try \"-help\".");
154                System.exit(1);
155            }
156        } catch (SAXParseException spe) {
157
158            String publicId = spe.getPublicId();
159            System.err.println(
160                (publicId != null ? publicId + ", line " : "Line ")
161                + spe.getLineNumber()
162                + ", column "
163                + spe.getColumnNumber()
164                + ": "
165                + spe.getMessage()
166                + '.'
167            );
168            System.exit(1);
169        } catch (TransformerException te) {
170
171            SourceLocator l = te.getLocator();
172            if (l == null) {
173                System.err.println(te.getMessage());
174            } else {
175                String publicId = l.getPublicId(); // TODO: Do we get the input file path here?
176                System.err.println(
177                    (publicId != null ? publicId + ", line " : "Line ")
178                    + ", line "
179                    + l.getLineNumber()
180                    + ", column "
181                    + l.getColumnNumber()
182                    + ": "
183                    + te.getMessage()
184                    + '.'
185                );
186            }
187            System.exit(1);
188        } catch (HtmlException he) {
189            System.err.println(he);
190            System.exit(1);
191        }
192    }
193}