001 002/* 003 * html2txt - Converts HTML documents to plain text 004 * 005 * Copyright (c) 2015, Arno Unkrig 006 * All rights reserved. 007 * 008 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the 009 * following conditions are met: 010 * 011 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the 012 * following disclaimer. 013 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the 014 * following disclaimer in the documentation and/or other materials provided with the distribution. 015 * 3. The name of the author may not be used to endorse or promote products derived from this software without 016 * specific prior written permission. 017 * 018 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 019 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 020 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 021 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 022 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 023 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 024 * POSSIBILITY OF SUCH DAMAGE. 025 */ 026 027package de.unkrig.html2txt; 028 029import java.io.File; 030import java.io.InputStream; 031import java.io.InputStreamReader; 032import java.io.OutputStreamWriter; 033import java.io.PrintWriter; 034import java.nio.charset.Charset; 035 036import javax.xml.transform.SourceLocator; 037import javax.xml.transform.TransformerException; 038 039import org.xml.sax.SAXParseException; 040 041import de.unkrig.commons.io.IoUtil; 042import de.unkrig.html2txt.Html2Txt.HtmlException; 043 044/** 045 * A command line interface for {@link Html2Txt}. 046 */ 047public final 048class Main { 049 050 private Main() {} 051 052 /** 053 * <h2>Usage:</h2> 054 * 055 * <dl> 056 * <dt>{@code html2txt} [ <var>option</var> ] ... <var>input-file</var></dt> 057 * <dd> 058 * Converts the HTML document in the <var>input-file</var> to plain text, and writes it to STDOUT. 059 * </dd> 060 * <dt>{@code html2txt} [ <var>option</var> ] ... <var>input-file</var> <var>output-file</var></dt> 061 * <dd> 062 * Converts the HTML document in the <var>input-file</var> to plain text, and writes it to the 063 * <var>output-file</var>. 064 * </dd> 065 * </dl> 066 * 067 * <h2>Options:</h2> 068 * 069 * <dl> 070 * <dt>{@code -help}</dt> 071 * <dd> 072 * Print this text and terminate. 073 * </dd> 074 * <dt>{@code -page-width} <var>N</var></dt> 075 * <dd> 076 * The maximum line length to produce. Defaults to the value of the "{@code $COLUMNS}" environment variable, 077 * if set, otherwise to "80". 078 * </dd> 079 * <dt>{@code -encoding} <var>enc</var></dt> 080 * <dd> 081 * The charset to use when reading the input file and writing the output file. 082 * </dd> 083 * <dt>{@code -input-encoding} <var>enc</var></dt> 084 * <dd> 085 * The charset to use when reading the input file. 086 * </dd> 087 * <dt>{@code -output-encoding} <var>enc</var></dt> 088 * <dd> 089 * The charset to use when writing the output file. 090 * </dd> 091 * </dl> 092 */ 093 public static void 094 main(String[] args) throws Exception { 095 096 Html2Txt html2Txt = new Html2Txt(); 097 098 int idx = 0; 099 while (idx < args.length) { 100 String arg = args[idx]; 101 if (!arg.startsWith("-")) break; 102 idx++; 103 if ("-help".equals(arg)) { 104 InputStream is = Main.class.getClassLoader().getResourceAsStream("de/unkrig/html2txt/usage.txt"); 105 IoUtil.copy( 106 new InputStreamReader(is, Charset.forName("UTF-8")), // inputStream 107 true, // closeReader 108 new OutputStreamWriter(System.out), // outputStream 109 false // closeWriter 110 ); 111 return; 112 } else 113 if ("-page-width".equals(arg)) { 114 html2Txt.setPageWidth(Integer.parseInt(args[idx++])); 115 } else 116 if ("-encoding".equals(arg)) { 117 Charset cs = Charset.forName(args[idx++]); 118 html2Txt.setInputCharset(cs); 119 html2Txt.setOutputCharset(cs); 120 } else 121 if ("-input-encoding".equals(arg)) { 122 html2Txt.setInputCharset(Charset.forName(args[idx++])); 123 } else 124 if ("-output-encoding".equals(arg)) { 125 html2Txt.setOutputCharset(Charset.forName(args[idx++])); 126 } else 127 { 128 System.err.println("Invalid command line option \"" + arg + "\"; try \"-help\"."); 129 System.exit(1); 130 return; 131 } 132 } 133 134 try { 135 switch (args.length - idx) { 136 137 case 1: 138 { 139 File inputFile = new File(args[idx++]); 140 html2Txt.html2txt(inputFile, new PrintWriter(System.out)); 141 } 142 break; 143 144 case 2: 145 { 146 File inputFile = new File(args[idx++]); 147 File outputFile = new File(args[idx++]); 148 html2Txt.html2txt(inputFile, outputFile); 149 } 150 break; 151 152 default: 153 System.err.println("Invalid number of command line arguments; try \"-help\"."); 154 System.exit(1); 155 } 156 } catch (SAXParseException spe) { 157 158 String publicId = spe.getPublicId(); 159 System.err.println( 160 (publicId != null ? publicId + ", line " : "Line ") 161 + spe.getLineNumber() 162 + ", column " 163 + spe.getColumnNumber() 164 + ": " 165 + spe.getMessage() 166 + '.' 167 ); 168 System.exit(1); 169 } catch (TransformerException te) { 170 171 SourceLocator l = te.getLocator(); 172 if (l == null) { 173 System.err.println(te.getMessage()); 174 } else { 175 String publicId = l.getPublicId(); // TODO: Do we get the input file path here? 176 System.err.println( 177 (publicId != null ? publicId + ", line " : "Line ") 178 + ", line " 179 + l.getLineNumber() 180 + ", column " 181 + l.getColumnNumber() 182 + ": " 183 + te.getMessage() 184 + '.' 185 ); 186 } 187 System.exit(1); 188 } catch (HtmlException he) { 189 System.err.println(he); 190 System.exit(1); 191 } 192 } 193}