001
002/*
003 * html2txt - Converts HTML documents to plain text
004 *
005 * Copyright (c) 2015, Arno Unkrig
006 * All rights reserved.
007 *
008 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
009 * following conditions are met:
010 *
011 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
012 *       following disclaimer.
013 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
014 *       following disclaimer in the documentation and/or other materials provided with the distribution.
015 *    3. The name of the author may not be used to endorse or promote products derived from this software without
016 *       specific prior written permission.
017 *
018 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
020 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
021 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
022 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
023 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
024 * POSSIBILITY OF SUCH DAMAGE.
025 */
026
027package de.unkrig.html2txt;
028
029import java.io.File;
030import java.io.PrintWriter;
031import java.io.Reader;
032import java.io.Writer;
033import java.nio.charset.Charset;
034import java.util.ArrayList;
035import java.util.Collections;
036import java.util.List;
037import java.util.Map;
038
039import javax.xml.parsers.DocumentBuilder;
040import javax.xml.parsers.DocumentBuilderFactory;
041import javax.xml.parsers.ParserConfigurationException;
042import javax.xml.transform.TransformerException;
043
044import org.w3c.dom.Attr;
045import org.w3c.dom.Document;
046import org.w3c.dom.Element;
047import org.w3c.dom.Node;
048import org.xml.sax.ErrorHandler;
049import org.xml.sax.InputSource;
050import org.xml.sax.Locator;
051import org.xml.sax.SAXException;
052import org.xml.sax.SAXParseException;
053
054import de.unkrig.commons.io.IoUtil;
055import de.unkrig.commons.io.LineUtil;
056import de.unkrig.commons.lang.AssertionUtil;
057import de.unkrig.commons.lang.StringUtil;
058import de.unkrig.commons.lang.protocol.Consumer;
059import de.unkrig.commons.lang.protocol.ConsumerUtil;
060import de.unkrig.commons.lang.protocol.ConsumerWhichThrows;
061import de.unkrig.commons.lang.protocol.Producer;
062import de.unkrig.commons.nullanalysis.Nullable;
063import de.unkrig.commons.text.xml.XmlUtil;
064import de.unkrig.commons.util.collections.CollectionUtil;
065
066/**
067 * A converter that turns an HTML document into plain text, using spaces and various punctuation characters to format
068 * it.
069 * <p>
070 *   One important restriction is that the HTML document must be "well-formed", i.e. all opening tags must be
071 *   <i>exactly</i> matched by closing tags, i.e.:
072 * </p>
073 * <pre>
074 * Let's &lt;i>emphasize&lt;/i>.
075 * &lt;ul>
076 *   &lt;li>List items&lt;/li>
 *   &lt;li>must be terminated with "&lt;tt>&amp;lt;/li>&lt;/tt>".&lt;/li>
078 * &lt;/ul>
079 * &lt;br />
080 * &lt;hr />
081 * </pre>
082 */
083public
084class Html2Txt {
085
    // Runs once when this class is initialized.
    // NOTE(review): presumably switches on "assert" checking for this class independently of the JVM's "-ea"
    // flag -- confirm against AssertionUtil.enableAssertionsForThisClass().
    static { AssertionUtil.enableAssertionsForThisClass(); }
087
    /**
     * All methods of this {@link ErrorHandler} throw the {@link SAXParseException} they receive, i.e. warnings are
     * treated exactly like errors and fatal errors.
     * <p>
     *   Because each method executes "{@code throw e}", passing {@code null} results in a {@link
     *   NullPointerException}.
     * </p>
     */
    @SuppressWarnings("null")
    public static final ErrorHandler
    SIMPLE_SAX_ERROR_HANDLER = new ErrorHandler() {
        @Override public void warning(@Nullable SAXParseException e)    throws SAXParseException { throw e; }
        @Override public void fatalError(@Nullable SAXParseException e) throws SAXParseException { throw e; }
        @Override public void error(@Nullable SAXParseException e)      throws SAXParseException { throw e; }
    };
096
    /**
     * All methods of this {@link HtmlErrorHandler} throw the {@link HtmlException} they receive, i.e. warnings are
     * treated exactly like errors and fatal errors.
     */
    public static final HtmlErrorHandler
    SIMPLE_HTML_ERROR_HANDLER = new HtmlErrorHandler() {
        @Override public void warning(HtmlException e)    throws HtmlException { throw e; }
        @Override public void fatalError(HtmlException e) throws HtmlException { throw e; }
        @Override public void error(HtmlException e)      throws HtmlException { throw e; }
    };
104
105    /**
106     * The handler for any HTML-related warnings, errors and fatal errors that may occur during conversion.
107     */
108    HtmlErrorHandler htmlErrorHandler = Html2Txt.SIMPLE_HTML_ERROR_HANDLER;
109
110    private int     pageLeftMarginWidth  /*= 0*/;
111    private int     pageRightMarginWidth = 1;
112    private Charset inputCharset         = Charset.defaultCharset();
113    private Charset outputCharset        = Charset.defaultCharset();
114    private int     pageWidth;
115
116    {
117        try {
118            this.pageWidth = Integer.parseInt(System.getenv("COLUMNS"));
119        } catch (Exception e) {
120            this.pageWidth = 80;
121        }
122    }
123
124    /**
125     * Representation of an exceptional condition that occurred during HTML processing. This exception is always
126     * related to a node in the HTML DOM.
127     */
128    public static
129    class HtmlException extends Exception {
130
131        private static final long serialVersionUID = 1L;
132
133        private final Node node;
134
135        public
136        HtmlException(Node node, String message) {
137            super(message);
138            this.node = node;
139        }
140
141        @Override public String
142        toString() {
143
144            String s = this.getClass().getName();
145
146            {
147                Locator l = XmlUtil.getLocation(this.node);
148                if (l != null) {
149                    String publicId = l.getPublicId();
150                    if (publicId != null) s += ", " + publicId;
151                    s += ", line " + l.getLineNumber() + ", column " + l.getColumnNumber();
152                }
153            }
154
155            {
156                String message = this.getLocalizedMessage();
157                if (message != null) s += ": " + message;
158            }
159
160            return s;
161        }
162    }
163
    /**
     * Handles {@link HtmlException}s. Each method receives the exception that describes the condition, and may
     * either return normally or throw an {@link HtmlException} (e.g. the one it received, like {@link
     * Html2Txt#SIMPLE_HTML_ERROR_HANDLER} does).
     */
    public
    interface HtmlErrorHandler {
        // SUPPRESS CHECKSTYLE JavadocMethod:3
        void warning(HtmlException e)    throws HtmlException;
        void fatalError(HtmlException e) throws HtmlException;
        void error(HtmlException e)      throws HtmlException;
    }
172
    /**
     * Formats an HTML block element.
     *
     * @see Html2Txt#ALL_BLOCK_ELEMENTS
     */
    public
    interface BlockElementFormatter {

        /**
         * Appends lines to the <var>output</var>. The first <var>leftMarginWidth</var> characters of each produced
         * line are spaces (except for the first line, where the string produced by {@link Html2Txt.Bulleting#next()}
         * is placed in the left margin), followed by up to <var>measure</var> characters.
         *
         * @param html2Txt        The converter on whose behalf the <var>element</var> is formatted
         * @param leftMarginWidth The number of characters that form the left margin of each produced line
         * @param bulleting       Produces the text that is placed in the left margin of the first line
         * @param measure         The maximum number of characters that follow the left margin
         * @param element         The block element to format
         * @param output          Consumes the produced lines
         * @throws HtmlException  A problem related to the <var>element</var> or one of its subnodes was detected
         */
        void
        format(
            Html2Txt                       html2Txt,
            int                            leftMarginWidth,
            Bulleting                      bulleting,
            int                            measure,
            Element                        element,
            Consumer<? super CharSequence> output
        ) throws HtmlException;
    }
196
    /**
     * Formats an HTML inline element.
     *
     * @see Html2Txt#ALL_INLINE_ELEMENTS
     */
    public
    interface InlineElementFormatter {

        /**
         * Appends characters to the <var>output</var>; "{@code \n}" represents a "break" ("{@code <br />}").
         *
         * @param html2Txt       The converter on whose behalf the <var>element</var> is formatted
         * @param element        The inline element to format
         * @param output         The buffer that the produced characters are appended to
         * @throws HtmlException A problem related to the <var>element</var> or one of its subnodes was detected
         */
        void format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException;
    }
210
    /**
     * Produces the sequence of "bullets", i.e. the texts that are placed in the left margin of the first line of a
     * formatted block.
     */
    interface Bulleting {

        /**
         * @return The text for the "next" bullet, e.g. "7.", "G.", "vii."
         */
        String next();

        /**
         * {@link #next()} always returns the empty string.
         */
        Bulleting NONE = new Bulleting() {  @Override public String next() { return ""; } };
    }
223
224    enum NumberingType {
225
226        /**
227         * <dl>
228         *   <dt>0</dt><dd>({@code NumberFormatException})</dd>
229         *   <dt>1</dt><dd>"{@code a}"</dd>
230         *   <dt>2</dt><dd>"{@code b}"</dd>
231         *   <dt>26</dt><dd>"{@code z}"</dd>
232         *   <dt>27</dt><dd>"{@code aa}"</dd>
233         *   <dt>28</dt><dd>"{@code ab}"</dd>
234         *   <dt>702</dt><dd>"{@code zz}"</dd>
235         *   <dt>703</dt><dd>"{@code aaa}"</dd>
236         * </dl>
237         * Etc.
238         */
239        LOWERCASE_LETTERS {
240
241            @Override public long
242            parse(String s) {
243                long result = 0;
244                for (int i = 0; i < s.length(); i++) {
245                    char c = s.charAt(i);
246                    if (c >= 'A' && c <= 'Z') {
247                        result = 26 * result + (c - 'A') + 1;
248                    } else
249                    if (c >= 'a' && c <= 'z') {
250                        result = 26 * result + (c - 'a') + 1;
251                    } else
252                    {
253                        throw new NumberFormatException();
254                    }
255                }
256                return result;
257            }
258
259            @Override public String
260            toString(long value) {
261
262                if (value < 0) return '-' + this.toString(-value);
263                if (value == 0) throw new NumberFormatException();
264                if (value <= 26) return String.valueOf((char) (value + 'a' - 1));
265                return this.toString(value / 26) + ((char) ((value % 26) + 'a' - 1));
266            }
267        },
268
269        /**
270         * <dl>
271         *   <dt>0</dt><dd>({@code NumberFormatException})</dd>
272         *   <dt>1</dt><dd>"{@code A}"</dd>
273         *   <dt>2</dt><dd>"{@code B}"</dd>
274         *   <dt>26</dt><dd>"{@code Z}"</dd>
275         *   <dt>27</dt><dd>"{@code AA}"</dd>
276         *   <dt>28</dt><dd>"{@code AB}"</dd>
277         *   <dt>702</dt><dd>"{@code ZZ}"</dd>
278         *   <dt>703</dt><dd>"{@code AAA}"</dd>
279         * </dl>
280         * Etc.
281         */
282        UPPERCASE_LETTERS {
283
284            @Override public long parse(String s) { return LOWERCASE_LETTERS.parse(s); }
285
286            @Override public String
287            toString(long value) {
288
289                if (value < 0) return '-' + this.toString(-value);
290                if (value == 0) throw new NumberFormatException();
291                if (value <= 26) return String.valueOf((char) (value + 'A' - 1));
292                return this.toString(value / 26) + ((char) ((value % 26) + 'A' - 1));
293            }
294        },
295
296        /**
297         * <dl>
298         *   <dt>0</dt><dd>({@code NumberFormatException})</dd>
299         *   <dt>1</dt><dd>"{@code i}"</dd>
300         *   <dt>2</dt><dd>"{@code ii}"</dd>
301         *   <dt>3</dt><dd>"{@code iii}"</dd>
302         *   <dt>4</dt><dd>"{@code iv}"</dd>
303         *   <dt>9999</dt><dd>"{@code mmmmmmmmmcmlcix}"</dd>
304         *   <dt>10000</dt><dd>({@code NumberFormatException})</dd>
305         * </dl>
306         * Etc.
307         */
308        LOWERCASE_ROMAN_NUMERALS {
309
310            private final String[][] ds = {
311                " i ii iii iv v vi vii viii ix".split(" "),
312                " x xx xxx xl l lx lxx lxxx lc".split(" "),
313                " c cc ccc cd d dc dcc dccc cm".split(" "),
314                " m mm mmm mmmm mmmmm mmmmmm mmmmmmm mmmmmmmm mmmmmmmmm".split(" "),
315            };
316
317            @Override public long
318            parse(String s) {
319                if (s.isEmpty()) throw new NumberFormatException();
320                s = s.toLowerCase();
321
322                long result = 0;
323                for (int i = 3; i >= 0; i--) {
324                    for (int j = 9;; j--) {
325                        String d = this.ds[i][j];
326                        if (s.startsWith(d)) {
327                            result = 10 * result + j;
328                            break;
329                        }
330                    }
331                }
332                return result;
333            }
334
335            @Override public String
336            toString(long value) {
337                if (value == 0) throw new NumberFormatException();
338                if (value < 0) return '-' + this.toString(-value);
339                if (value >= 10000) throw new NumberFormatException();
340
341                if (value <= 9) return this.ds[0][(int) value];
342                StringBuilder sb = new StringBuilder();
343                if (value >= 1000) {
344                    sb.append(this.ds[3][(int) value / 1000]);
345                    value %= 1000;
346                }
347                if (value >= 100) {
348                    sb.append(this.ds[2][(int) value / 100]);
349                    value %= 100;
350                }
351                if (value >= 10) {
352                    sb.append(this.ds[1][(int) value / 10]);
353                    value %= 10;
354                }
355                if (value >= 1) {
356                    sb.append(this.ds[0][(int) value]);
357                }
358                return sb.toString();
359            }
360        },
361
362        /**
363         * <dl>
364         *   <dt>0</dt><dd>({@code NumberFormatException})</dd>
365         *   <dt>1</dt><dd>"{@code I}"</dd>
366         *   <dt>2</dt><dd>"{@code II}"</dd>
367         *   <dt>3</dt><dd>"{@code III}"</dd>
368         *   <dt>4</dt><dd>"{@code IV}"</dd>
369         *   <dt>9999</dt><dd>"{@code MMMMMMMMMCMLCIX}"</dd>
370         *   <dt>10000</dt><dd>({@code NumberFormatException})</dd>
371         * </dl>
372         * Etc.
373         */
374        UPPERCASE_ROMAN_LITERALS {
375            @Override public long   parse(String s)      { return NumberingType.LOWERCASE_ROMAN_NUMERALS.parse(s); }
376            @Override public String toString(long value) { return LOWERCASE_ROMAN_NUMERALS.toString().toUpperCase(); }
377        },
378
379        /**
380         * @see Long#parseLong(String)
381         * @see Long#toString(long)
382         */
383        ARABIC_DIGITS {
384            @Override public long   parse(String s)      { return Long.parseLong(s); }
385            @Override public String toString(long value) { return Long.toString(value); }
386        };
387
388        /**
389         * Converts the given string to an integral value.
390         */
391        public abstract long parse(String s);
392
393        /**
394         * Converts the given integral value to a string. Notice that some {@link NumberingType}s do not support the
395         * value zero, or numbers greater than 9999.
396         */
397        public abstract String toString(long value);
398    }
399
    /**
     * Sets a custom {@link HtmlErrorHandler} on this object. The default handler is {@link
     * #SIMPLE_HTML_ERROR_HANDLER}.
     *
     * @param htmlErrorHandler The handler to use from now on
     * @return                 This object, to allow for call chaining
     */
    public Html2Txt
    setErrorHandler(HtmlErrorHandler htmlErrorHandler) {
        this.htmlErrorHandler = htmlErrorHandler;
        return this;
    }
409
    /**
     * The number of spaces that precedes each line of output; defaults to zero.
     *
     * @param pageLeftMarginWidth The new left margin width, in characters
     * @return                    This object, to allow for call chaining
     */
    public Html2Txt
    setPageLeftMarginWidth(int pageLeftMarginWidth) {
        this.pageLeftMarginWidth = pageLeftMarginWidth;
        return this;
    }
418
    /**
     * The maximum length of output lines is "<var>pageWidth</var> - <var>rightMarginWidth</var>".
     * <p>
     *   Defaults to "{@code 1}", to avoid extra line wraps on certain terminals.
     * </p>
     *
     * @param pageRightMarginWidth The new right margin width, in characters
     * @return                     This object, to allow for call chaining
     * @see #setPageWidth(int)
     */
    public Html2Txt
    setPageRightMarginWidth(int pageRightMarginWidth) {
        this.pageRightMarginWidth = pageRightMarginWidth;
        return this;
    }
432
    /**
     * Sets the charset to use when reading HTML input files. Defaults to the {@link Charset#defaultCharset() JVM
     * default charset}.
     * <p>
     *   Notice that, unlike most other setters of this class, this method does not return {@code this}.
     * </p>
     *
     * @param cs The charset for decoding input files
     */
    public void
    setInputCharset(Charset cs) {
        this.inputCharset = cs;
    }
441
    /**
     * Sets the charset to use when writing text output files. Defaults to the {@link Charset#defaultCharset() JVM
     * default charset}.
     * <p>
     *   Notice that, unlike most other setters of this class, this method does not return {@code this}.
     * </p>
     *
     * @param cs The charset for encoding output files
     */
    public void
    setOutputCharset(Charset cs) {
        this.outputCharset = cs;
    }
450
    /**
     * The maximum length of output lines is "<var>pageWidth</var> - <var>rightMarginWidth</var>".
     * <p>
     *   Defaults to the value of the environment variable "{@code $COLUMNS}", or, if that is not set, to 80.
     * </p>
     *
     * @param pageWidth The new page width, in characters
     * @return          This object, to allow for call chaining
     * @see #setPageRightMarginWidth(int)
     */
    public Html2Txt
    setPageWidth(int pageWidth) { this.pageWidth = pageWidth; return this; }
461
462    /**
463     * Reads, scans and parses the HTML document in the {@code inputFile}, generates a plain text document, and
464     * writes it to the {@code output}.
465     */
466    public void
467    html2txt(File inputFile, Writer output)
468    throws ParserConfigurationException, SAXException, TransformerException, HtmlException {
469
470        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
471        db.setErrorHandler(Html2Txt.SIMPLE_SAX_ERROR_HANDLER);
472
473        Document document = XmlUtil.parse(db, inputFile, this.inputCharset.name());
474
475        this.html2txt(document, output);
476    }
477
478    /**
479     * Reads, scans and parses the HTML document in the {@code inputFile}, generates a plain text document, and
480     * writes it to the {@code output}.
481     */
482    public void
483    html2txt(Reader input, Writer output)
484    throws ParserConfigurationException, SAXException, TransformerException, HtmlException {
485
486        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
487        db.setErrorHandler(Html2Txt.SIMPLE_SAX_ERROR_HANDLER);
488
489        InputSource inputSource = new InputSource();
490        inputSource.setCharacterStream(input);
491
492        Document document = XmlUtil.parse(db, inputSource);
493
494        this.html2txt(document, output);
495    }
496
497    /**
498     * Generates a plain text document from the given HTML <var>document</var>, and writes it to the {@code output}.
499     */
500    public void
501    html2txt(final Document document, Writer output) throws HtmlException {
502
503        document.getDocumentElement().normalize();
504
505        PrintWriter pw = output instanceof PrintWriter ? (PrintWriter) output : new PrintWriter(output);
506
507        this.html2txt(document, LineUtil.lineConsumer(pw));
508    }
509
510    /**
511     * Reads, scans and parses the HTML document in the {@code inputFile}, generates a plain text document, and
512     * writes it to the {@code outputFile}.
513     */
514    public void
515    html2txt(final File inputFile, File outputFile) throws Exception {
516
517        IoUtil.outputFilePrintWriter(
518            outputFile,
519            this.outputCharset,
520            new ConsumerWhichThrows<PrintWriter, Exception>() {
521
522                @Override public void
523                consume(PrintWriter pw) throws Exception { Html2Txt.this.html2txt(inputFile, pw); }
524            }
525        );
526    }
527
528    private void
529    html2txt(Document document, Consumer<? super CharSequence> output) throws HtmlException {
530
531        // Some block tags render vertical space, which we want to compress.
532        output = ConsumerUtil.<CharSequence>compress(output, StringUtil.IS_BLANK, "");
533
534        // Some formatters render trailing spaces (esp. the TABLE_FORMATTER), which we also want to suppress.
535        output = Html2Txt.rightTrim(output);
536
537        Element documentElement = document.getDocumentElement();
538
539        // Iff the document is structured like
540        //
541        //     <html>
542        //       ...
543        //       <body>...</body>
544        //       ...
545        //       <body>...</body>
546        //       ...
547        //     </html>
548        //     ...
549        //
550        // , then the result is the formatted <body>s.
551        if ("html".equals(documentElement.getNodeName())) {
552            for (Node n : XmlUtil.iterable(documentElement.getChildNodes())) {
553                if (n.getNodeType() == Node.ELEMENT_NODE && "body".equals(n.getNodeName())) {
554                    Element bodyElement = (Element) n;
555                    this.formatBlocks(
556                        this.pageLeftMarginWidth,
557                        Bulleting.NONE,
558                        Bulleting.NONE,
559                        this.pageWidth - this.pageLeftMarginWidth - this.pageRightMarginWidth,
560                        XmlUtil.iterable(bodyElement.getChildNodes()),
561                        output
562                    );
563                }
564            }
565
566            return;
567        }
568
569        // Otherwise, assume that the document poses an HTML *fragment*, and the top level nodes ar *blocks*.
570        this.formatBlocks(
571            this.pageLeftMarginWidth,
572            Bulleting.NONE,
573            Bulleting.NONE,
574            this.pageWidth - this.pageLeftMarginWidth - this.pageRightMarginWidth,
575            Collections.singletonList(documentElement),
576            output
577        );
578    }
579
580    /**
581     * Formats a sequence of {@link Node#TEXT_NODE TEXT} nodes and HTML inline or block {@link Node#ELEMENT_NODE
582     * ELEMENT} nodes.
583     */
584    <N extends Node> void
585    formatBlocks(
586        int                            leftMarginWidth,
587        Bulleting                      inlineSubelementsBulleting,
588        Bulleting                      blockSubelementsBulleting,
589        int                            measure,
590        Iterable<N>                    nodes,
591        Consumer<? super CharSequence> output
592    ) throws HtmlException {
593
594        List<Node> inlineNodes = new ArrayList<Node>();
595        for (Node n : nodes) {
596            if (n.getNodeType() == Node.TEXT_NODE) {
597                inlineNodes.add(n);
598            } else
599            if (Html2Txt.isInlineElement(n)) {
600                inlineNodes.add(n);
601            } else
602            if (Html2Txt.isBlockElement(n)) {
603                if (!inlineNodes.isEmpty()) {
604                    this.wordWrap(
605                        leftMarginWidth,
606                        inlineSubelementsBulleting,
607                        measure,
608                        this.getBlock(inlineNodes),
609                        output
610                    );
611                    inlineNodes.clear();
612                }
613
614                Element e = (Element) n;
615
616                BlockElementFormatter bef = Html2Txt.ALL_BLOCK_ELEMENTS.get(e.getTagName());
617                if (bef == null) {
618                    this.htmlErrorHandler.error(
619                        new HtmlException(n, "Unexpected block element \"" + XmlUtil.toString(e) + "\" in block")
620                    );
621                } else {
622                    bef.format(this, leftMarginWidth, blockSubelementsBulleting, measure, e, output);
623                }
624            } else
625            {
626                this.htmlErrorHandler.error(
627                    new HtmlException(n, "Unexpected node \"" + XmlUtil.toString(n) + "\" in <body>")
628                );
629            }
630        }
631
632        if (!inlineNodes.isEmpty()) {
633            this.wordWrap(
634                leftMarginWidth,
635                inlineSubelementsBulleting,
636                measure,
637                this.getBlock(inlineNodes),
638                output
639            );
640            inlineNodes.clear();
641        }
642    }
643
    /**
     * The given <var>text</var> is word-wrapped such that each output line begins with <var>leftMarginWidth</var>
     * spaces, followed by up to <var>measure</var> characters. If the <var>text</var> contains very long words, then
     * some of the output lines may be longer than "<var>leftMarginWidth</var> + <var>measure</var>".
     * <p>
     *   Newline characters ({@code '\n'}) appear as line breaks in the output. Each non-blank newline-separated
     *   segment is wrapped separately, and {@link Bulleting#next()} is invoked once for each of them.
     * </p>
     * <p>
     *   Lines are preferably broken at spaces, or else right after a hyphen ({@code '-'}).
     * </p>
     * <p>
     *   The output lines are fed to the <var>output</var>. If the <var>text</var> is blank, then no lines are
     *   produced and {@link Bulleting#next()} is not invoked.
     * </p>
     *
     * @param leftMarginWidth The number of spaces that precedes each line
     * @param bulleting       The string produced by {@link Bulleting#next()} is placed in the left margin of the
     *                        first line generated
     * @param measure         The maximum number of characters per line (not counting the left margin); values less
     *                        than one are treated as one
     * @param text            The text to wrap
     * @param output          Consumes the generated lines
     */
    private void
    wordWrap(
        int                            leftMarginWidth,
        Bulleting                      bulleting,
        int                            measure,
        String                         text,
        Consumer<? super CharSequence> output
    ) throws HtmlException {

        // Strip leading and trailing whitespace (including line breaks); blank text produces no output at all.
        text = text.trim();
        if (text.length() == 0) return;

        if (measure < 1) measure = 1;

        // From this point on, the first letter of "text" is always a non-space character.

        // Process all segments that are terminated by a line break recursively; after this loop, "text" is the
        // segment after the last line break.
        for (int nlidx = text.indexOf('\n'); nlidx != -1; nlidx = text.indexOf('\n')) {
            this.wordWrap(leftMarginWidth, bulleting, measure, text.substring(0, nlidx), output);

            // Skip the line break and any spaces that follow it.
            for (nlidx++; nlidx < text.length() && text.charAt(nlidx) == ' '; nlidx++);
            if (nlidx == text.length()) return;
            text = text.substring(nlidx);
        }

        // Compose the left margin of the first line: The bullet is right-aligned in the left margin and separated
        // from the text by one space. If the bullet (plus the space) is wider than the left margin, then the first
        // line's margin is wider than that of the continuation lines.
        String continuationLineLeftMargin = StringUtil.repeat(leftMarginWidth, ' ');
        String leftMargin;
        {
            String bullet = bulleting.next();
            if (bullet.length() == 0) {
                leftMargin = continuationLineLeftMargin;
            } else
            if (bullet.length() + 1 < leftMarginWidth) {
                leftMargin = StringUtil.repeat(leftMarginWidth - bullet.length() - 1, ' ') + bullet + ' ';
            } else
            {
                leftMargin = bullet + ' ';
            }
        }

        // Each iteration emits one line and removes it from "text"; the remainder is emitted after the loop.
        for (;;) {

            // The rest fits in one line.
            if (text.length() <= measure) break;

            // Determine the point to wrap at.
            int idx1; // Space after the last word to keep in THIS line.
            int idx2; // First letter of the first word to put on the NEXT line.
            IDXS:
            if (text.charAt(measure) == ' ') {

                // The first character beyond the measure is a space, so break right here: Back up over the spaces
                // before it, and skip the spaces after it.
                for (idx1 = measure; idx1 > 0 && text.charAt(idx1 - 1) == ' '; idx1--);
                for (idx2 = measure + 1; idx2 < text.length() && text.charAt(idx2) == ' '; idx2++);
            } else
            {

                // The measure ends in the middle of a word; scan backwards for the beginning of that word. If a
                // hyphen is encountered on the way, then break right after the hyphen.
                for (idx2 = measure; idx2 > 0 && text.charAt(idx2 - 1) != ' '; idx2--) {
                    if (text.charAt(idx2 - 1) == '-') {
                        idx1 = idx2;
                        break IDXS;
                    }
                }
                if (idx2 == 0) {

                    // The first word is longer than the measure and contains no hyphen within the measure, so let
                    // it exceed the measure: Find its end.
                    for (idx1 = measure + 1; idx1 < text.length() && text.charAt(idx1) != ' '; idx1++);

                    // The long word is the last word; it is emitted after the loop.
                    if (idx1 == text.length()) break;

                    // Find the beginning of the following word.
                    for (idx2 = idx1 + 1; idx2 < text.length() && text.charAt(idx2) == ' '; idx2++);
                    if (idx2 == text.length()) {

                        // Only spaces follow the long word; strip them and emit the word after the loop.
                        text = text.substring(0, idx1);
                        break;
                    }
                } else {

                    // "idx2" is the beginning of the word that does not fit; back up over the spaces before it.
                    for (idx1 = idx2 - 1; text.charAt(idx1 - 1) == ' '; idx1--);
                }
            }

            output.consume(leftMargin + text.substring(0, idx1));

            text = text.substring(idx2);

            // Only the first line carries the bullet.
            leftMargin = continuationLineLeftMargin;
        }

        output.consume(leftMargin + text);
    }
736
737    /**
738     * Formats text and inline elements into one long line, except for "{@code <br />}" tags, which map into
739     * line breaks.
740     */
741    private String
742    getBlock(Iterable<Node> nodes) throws HtmlException {
743        StringBuilder sb = new StringBuilder();
744
745        for (Node n : nodes) {
746            short nodeType = n.getNodeType();
747
748            if (nodeType == Node.TEXT_NODE) {
749                String content = n.getTextContent();
750                sb.append(content.replaceAll("\\s+", " "));
751            } else
752            if (nodeType == Node.ELEMENT_NODE) {
753                Element e = (Element) n;
754
755                InlineElementFormatter ief = Html2Txt.ALL_INLINE_ELEMENTS.get(e.getTagName());
756                if (ief == null) {
757                    this.htmlErrorHandler.error(
758                        new HtmlException(n, "Unexpected element \"" + XmlUtil.toString(e) + "\" in block")
759                    );
760                } else {
761                    ief.format(this, e, sb);
762                }
763            } else
764            {
765                this.htmlErrorHandler.error(new HtmlException(n, "Unexpected node in block"));
766            }
767        }
768        return sb.toString();
769    }
770
771    /**
772     * "Block-Level" is categorization of HTML elements, as contrasted with "inline" elements.
773     * <p>
774     *   Block-level elements may appear only within a {@code <body>} element.
775     *   Their most significant characteristic is that they typically are formatted with a line break before and after
776     *   the element (thereby creating a stand-alone block of content). That is, they take up the width of their
777     *   containers.
778     * </p>
779     * <p>
780     *   The distinction of block-level vs. inline elements is used in HTML specifications up  to 4.01. In HTML5, this
781     *   binary distinction is replaced with a more complex set of content categories. The "block-level" category
782     *   roughly corresponds to the category of flow content in HTML5, while "inline" corresponds to phrasing content,
783     *   but there are additional categories.
784     * </p>
785     * <p>
786     *   There are a couple of key differences between block-level elements and inline elements:
787     * </p>
788     * <dl>
789     *   <dt>Formatting</dt>
790     *   <dd>
791     *     By default, block-level elements begin on new lines.
792     *   </dd>
793     *   <dt>Content model</dt>
794     *   <dd>
795     *     Generally, block-level elements may contain inline elements and other block-level elements. Inherent in
796     *     this structural distinction is the idea that block elements create "larger" structures than inline elements.
797     *   </dd>
798     * </dl>
799     * <p>
800     *   Quoted from <a href="https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements">Mozilla Developer
801     *   Network, "Block-level Elements"</a>.
802     * </p>
803     *
804     * <p>
805     *   See also <a href="http://www.w3schools.com/html/html_blocks.asp">HTML Tutorial, section "HTML Block
806     *   Elements"</a>.
807     * </p>
808     *
809     * @return Whether the given {@code node} is one of the "block elements" by the HTML standard
810     */
811    private static boolean
812    isBlockElement(Node node) {
813
814        if (node.getNodeType() != Node.ELEMENT_NODE) return false;
815        Element e = (Element) node;
816
817        return Html2Txt.ALL_BLOCK_ELEMENTS.containsKey(e.getTagName());
818    }
819
820    /**
821     * @param tagName E.g. "{@code table}"
822     */
823    @Nullable static Element
824    isElement(Node node, String tagName) {
825
826        if (node.getNodeType() != Node.ELEMENT_NODE) return null;
827        Element e = (Element) node;
828
829        return tagName.equals(e.getTagName()) ? e : null;
830    }
831
832    private static final BlockElementFormatter
833    HR_FORMATTER = new BlockElementFormatter() {
834
835        @Override public void
836        format(
837            Html2Txt                       html2Txt,
838            int                            leftMarginWidth,
839            Bulleting                      bulleting,
840            int                            measure,
841            Element                        element,
842            Consumer<? super CharSequence> output
843        ) {
844            output.consume(StringUtil.repeat(leftMarginWidth, ' ') + StringUtil.repeat(measure, '-'));
845        }
846    };
847
848    /**
849     * Formatter for the "{@code <ol>}" ("ordered list") HTML block element.
850     */
851    protected static final BlockElementFormatter
852    OL_FORMATTER = new BlockElementFormatter() {
853
854        @Override public void
855        format(
856            Html2Txt                       html2Txt,
857            int                            leftMarginWidth,
858            Bulleting                      bulleting,
859            int                            measure,
860            Element                        element,
861            Consumer<? super CharSequence> output
862        ) throws HtmlException {
863
864            // Determine the OL type.
865            final NumberingType numberingType;
866            {
867                Attr s = element.getAttributeNode("type");
868                if (s == null) {
869                    numberingType = NumberingType.ARABIC_DIGITS;
870                } else {
871                    String value = s.getValue();
872                    numberingType = (
873                        "a".equals(value) ? NumberingType.LOWERCASE_LETTERS :
874                        "A".equals(value) ? NumberingType.UPPERCASE_LETTERS :
875                        "i".equals(value) ? NumberingType.LOWERCASE_ROMAN_NUMERALS :
876                        "I".equals(value) ? NumberingType.UPPERCASE_ROMAN_LITERALS :
877                        NumberingType.ARABIC_DIGITS
878                    );
879                }
880            }
881
882            // Compute the index to start from.
883            final int start;
884            {
885                int tmp;
886                try {
887                    tmp = Integer.parseInt(element.getAttribute("start"));
888                } catch (Exception e) {
889                    tmp = 1;
890                }
891                start = tmp;
892            }
893
894            html2Txt.formatBlocks(
895                leftMarginWidth + 5,
896                Bulleting.NONE,        // inlineSubelementsBulleting
897                new Bulleting() {      // blockSubelementsBulleting
898                    int nextValue = start;
899                    @Override public String next() { return numberingType.toString(this.nextValue++) + "."; }
900                },
901                measure - 5,
902                XmlUtil.iterable(element.getChildNodes()),
903                output
904            );
905        }
906    };
907
908    private static final BlockElementFormatter
909    LI_FORMATTER = new BlockElementFormatter() {
910
911        @Override public void
912        format(
913            Html2Txt                       html2Txt,
914            int                            leftMarginWidth,
915            Bulleting                      bulleting,
916            int                            measure,
917            Element                        element,
918            Consumer<? super CharSequence> output
919        ) throws HtmlException {
920
921            html2Txt.formatBlocks(
922                leftMarginWidth,
923                bulleting,      // inlineSubelementsBulleting
924                Bulleting.NONE, // blockSubelementsBulleting
925                measure,
926                XmlUtil.iterable(element.getChildNodes()),
927                output
928            );
929        }
930    };
931
932    private static final BlockElementFormatter
933    PRE_FORMATTER = new BlockElementFormatter() {
934
935        @Override public void
936        format(
937            Html2Txt                       html2Txt,
938            int                            leftMarginWidth,
939            Bulleting                      bulleting,
940            int                            measure,
941            Element                        element,
942            Consumer<? super CharSequence> output
943        ) throws HtmlException {
944
945            StringBuilder sb = new StringBuilder();
946            for (Node n : XmlUtil.iterable(element.getChildNodes())) {
947                short nodeType = n.getNodeType();
948
949                if (nodeType == Node.TEXT_NODE) {
950                    sb.append(n.getTextContent());
951                } else
952                if (nodeType == Node.ELEMENT_NODE) {
953                    Element e = (Element) n;
954
955                    InlineElementFormatter ief = Html2Txt.ALL_INLINE_ELEMENTS.get(e.getTagName());
956                    if (ief == null) {
957                        html2Txt.htmlErrorHandler.error(
958                            new HtmlException(n, "Unexpected element \"" + XmlUtil.toString(e) + "\" in <pre>")
959                        );
960                    } else {
961                        ief.format(html2Txt, e, sb);
962                    }
963                } else
964                {
965                    html2Txt.htmlErrorHandler.error(new HtmlException(n, "Unexpected node in <pre>"));
966                }
967            }
968
969            Producer<? extends CharSequence> lp = LineUtil.lineProducer(sb);
970            for (boolean first = true;; first = false) {
971
972                CharSequence line = lp.produce();
973                if (line == null) break;
974
975                // Ignore leading empty lines.
976                if (first && line.length() == 0) continue;
977
978                if (first) {
979                    String bullet = bulleting.next();
980                    if (bullet.length() + 1 > leftMarginWidth) {
981                        line = bullet + ' ' + line;
982                    } else {
983                        line = StringUtil.repeat(leftMarginWidth - bullet.length() - 1, ' ') + bullet + ' ' + line;
984                    }
985                }
986
987                output.consume(line);
988            }
989        }
990    };
991
992    /**
993     * Formatter for the "{@code <table>}" HTML block element.
994     */
995    protected static final BlockElementFormatter
996    TABLE_FORMATTER = new TableFormatter();
997
998    /**
999     * @return The length of the longest of the <var>css</var>, or {@code 0} iff <var>css</var> is empty
1000     */
1001    public static int
1002    maxLength(Iterable<? extends CharSequence> css) {
1003
1004        int result = 0;
1005        for (CharSequence cs : css) {
1006            int len = cs.length();
1007            if (len > result) result = len;
1008        }
1009
1010        return result;
1011    }
1012
1013    private static final BlockElementFormatter
1014    UL_FORMATTER = new BlockElementFormatter() {
1015
1016        @Override public void
1017        format(
1018            Html2Txt                       html2Txt,
1019            int                            leftMarginWidth,
1020            Bulleting                      bulleting,
1021            int                            measure,
1022            Element                        element,
1023            Consumer<? super CharSequence> output
1024        ) throws HtmlException {
1025
1026            html2Txt.formatBlocks(
1027                leftMarginWidth + 3,
1028                Bulleting.NONE,
1029                new Bulleting() { @Override public String next() { return "*"; } },
1030                measure - 3,
1031                XmlUtil.iterable(element.getChildNodes()),
1032                output
1033            );
1034        }
1035    };
1036
1037    private static
1038    class HeadingBlockElementFormatter implements BlockElementFormatter {
1039
1040        private boolean          emptyLineAbove, emptyLineBelow;
1041        @Nullable private String prefix, suffix;
1042        private int              underline = -1;
1043
1044        public
1045        HeadingBlockElementFormatter(String prefix, String suffix) {
1046            this.prefix = prefix;
1047            this.suffix = suffix;
1048        }
1049
1050        public
1051        HeadingBlockElementFormatter(boolean emptyLineAbove, char underline, boolean emptyLineBelow) {
1052            this.emptyLineAbove = emptyLineAbove;
1053            this.underline      = underline;
1054            this.emptyLineBelow = emptyLineBelow;
1055        }
1056
1057        @Override public void
1058        format(
1059            Html2Txt                       html2Txt,
1060            int                            leftMarginWidth,
1061            Bulleting                      bulleting,
1062            int                            measure,
1063            Element                        element,
1064            Consumer<? super CharSequence> output
1065        ) throws HtmlException {
1066
1067            String text = html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes()));
1068            if (this.prefix != null) text = this.prefix.concat(text);
1069            if (this.suffix != null) text = text.concat(this.suffix);
1070
1071            if (this.emptyLineAbove) output.consume("");
1072            output.consume(text);
1073            if (this.underline != -1) output.consume(StringUtil.repeat(text.length(), (char) this.underline));
1074            if (this.emptyLineBelow) output.consume("");
1075        }
1076    }
1077
1078    /**
1079     * Simply appends the element's formatted content, a.k.a. "the tag is ignored".
1080     */
1081    private static final BlockElementFormatter
1082    IGNORE_BLOCK_ELEMENT_FORMATTER = new IndentingBlockElementFormatter(0);
1083
1084    /**
1085     * Does <i>nothing</i>, i.e. even its contents is ignored.
1086     */
1087    private static final BlockElementFormatter NOP_BLOCK_ELEMENT_FORMATTER = new BlockElementFormatter() {
1088
1089        @Override public void
1090        format(
1091            Html2Txt                       html2Txt,
1092            int                            leftMarginWidth,
1093            Bulleting                      bulleting,
1094            int                            measure,
1095            Element                        element,
1096            Consumer<? super CharSequence> output
1097        ) {
1098            ;
1099        }
1100    };
1101
1102    private static final BlockElementFormatter
1103    NYI_BLOCK_ELEMENT_FORMATTER = new BlockElementFormatter() {
1104
1105        @Override public void
1106        format(
1107            Html2Txt                       html2Txt,
1108            int                            leftMarginWidth,
1109            Bulleting                      bulleting,
1110            int                            measure,
1111            Element                        element,
1112            Consumer<? super CharSequence> output
1113        ) throws HtmlException {
1114
1115            html2Txt.htmlErrorHandler.warning(
1116                new HtmlException(
1117                    element,
1118                    "HTML block element \"<" + element.getNodeName() + ">\" is not yet implemented and thus ignored"
1119                )
1120            );
1121
1122            Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER.format(
1123                html2Txt,
1124                leftMarginWidth,
1125                bulleting,
1126                measure,
1127                element,
1128                output
1129            );
1130        }
1131    };
1132
1133    public static
1134    class IndentingBlockElementFormatter implements BlockElementFormatter {
1135
1136        private final int indentation;
1137
1138        public IndentingBlockElementFormatter(int indentation) { this.indentation = indentation; }
1139
1140        @Override public void
1141        format(
1142            Html2Txt                       html2Txt,
1143            int                            leftMarginWidth,
1144            Bulleting                      bulleting,
1145            int                            measure,
1146            Element                        element,
1147            Consumer<? super CharSequence> output
1148        ) throws HtmlException {
1149
1150            html2Txt.formatBlocks(
1151                leftMarginWidth + this.indentation,
1152                Bulleting.NONE,
1153                Bulleting.NONE,
1154                measure - this.indentation,
1155                XmlUtil.iterable(element.getChildNodes()),
1156                output
1157            );
1158        }
1159    }
1160
    /**
     * Defines the strategies for formatting HTML block elements: Maps the tag name of each supported block element
     * to the {@link BlockElementFormatter} that renders it. The map is unmodifiable.
     * <p>
     *   To see the HTML block elements and how they are formatted, click the word "{@code ALL_BLOCK_ELEMENTS}"
     *   (right above). The right hand side of the mapping means:
     * </p>
     * <dl>
     *   <dt>{@link Html2Txt#NYI_BLOCK_ELEMENT_FORMATTER NYI_BLOCK_ELEMENT_FORMATTER}</dt>
     *   <dd>
     *     Issues a "Not yet implemented" warning, then formats the element like an ignored element.
     *   </dd>
     *   <dt>{@link Html2Txt#IGNORE_BLOCK_ELEMENT_FORMATTER IGNORE_BLOCK_ELEMENT_FORMATTER}</dt>
     *   <dd>
     *     The element is simply replaced with its content (a.k.a. "the element is ignored").
     *   </dd>
     *   <dt>{@code new} {@link IndentingBlockElementFormatter IndentingBlockElementFormatter(<var>N</var>)}</dt>
     *   <dd>
     *     The block is formatted <var>N</var> characters indented, relative to the enclosing block.
     *   </dd>
     *   <dt>(Other)</dt>
     *   <dd>
     *     This HTML block element is formatted specially; see the respective field documentation on this page (e.g.
     *     {@link #OL_FORMATTER}).
     *   </dd>
     * </dl>
     */
    protected static final Map<String, BlockElementFormatter>
    ALL_BLOCK_ELEMENTS = Collections.unmodifiableMap(CollectionUtil.<String, BlockElementFormatter>map(
        "address",    new IndentingBlockElementFormatter(2),
        "article",    Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "aside",      Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "audio",      Html2Txt.NYI_BLOCK_ELEMENT_FORMATTER,
        "blockquote", new IndentingBlockElementFormatter(2),
        "canvas",     Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "dd",         new IndentingBlockElementFormatter(4),
        "div",        Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "dl",         new IndentingBlockElementFormatter(2),
        "dt",         Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "fieldset",   Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "figcaption", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "figure",     Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "footer",     Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "form",       Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "h1",         new HeadingBlockElementFormatter(true, '*', true),
        "h2",         new HeadingBlockElementFormatter(true, '=', true),
        "h3",         new HeadingBlockElementFormatter(true, '-', true),
        "h4",         new HeadingBlockElementFormatter("=== ", " ==="),
        "h5",         new HeadingBlockElementFormatter("== ",  " =="),
        "h6",         new HeadingBlockElementFormatter("= ",   " ="),
        "header",     Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "hgroup",     Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "hr",         Html2Txt.HR_FORMATTER,
        "li",         Html2Txt.LI_FORMATTER,
        "main",       Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "nav",        Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "noscript",   Html2Txt.NOP_BLOCK_ELEMENT_FORMATTER,
        "ol",         Html2Txt.OL_FORMATTER,
        "output",     Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "p",          Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "pre",        Html2Txt.PRE_FORMATTER,
        "section",    Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "table",      Html2Txt.TABLE_FORMATTER,
        "tfoot",      Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER,
        "ul",         Html2Txt.UL_FORMATTER,
        "video",      Html2Txt.NYI_BLOCK_ELEMENT_FORMATTER
    ));
1227
1228    /**
1229     * HTML (Hypertext Markup Language) elements are usually "inline" elements or "block-level" elements.
1230     * <p>
1231     *   An inline element occupies only the space bounded by the tags that define the inline element.
1232     * </p>
1233     *
1234     * <p>
1235     *   Quoted from <a href="https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elemente">Mozilla Developer
1236     *   Network, "Inline Elements"</a>.
1237     * </p>
1238     * <p>
1239     *   See <a href="http://www.w3schools.com/html/html_blocks.asp">HTML Tutorial, section "HTML Block Elements"</a>.
1240     * </p>
1241     */
1242    private static boolean
1243    isInlineElement(Node node) {
1244
1245        if (node.getNodeType() != Node.ELEMENT_NODE) return false;
1246        Element e = (Element) node;
1247
1248        return Html2Txt.ALL_INLINE_ELEMENTS.containsKey(e.getTagName());
1249    }
1250
1251    /**
1252     * Formats "{@code <a href="...">...</a>}" and "{@code <a name="..." />}".
1253     */
1254    private static final InlineElementFormatter
1255    A_FORMATTER = new InlineElementFormatter() {
1256
1257        @Override public void
1258        format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException {
1259            String name = element.getAttribute("name");
1260            String href = element.getAttribute("href");
1261            if (!name.isEmpty() && href.isEmpty()) {
1262                if (!html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())).isEmpty()) {
1263                    html2Txt.htmlErrorHandler.warning(
1264                        new HtmlException(element, "'<a name=\"...\" />' tag should not have content")
1265                    );
1266                }
1267
1268                // '<a name="..." />' renders as "".
1269                ;
1270            } else
1271            if (!href.isEmpty() && name.isEmpty()) {
1272                output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())));
1273                output.append(" (see \"").append(href).append("\")");
1274            } else
1275            {
1276                html2Txt.htmlErrorHandler.warning(
1277                    new HtmlException(element, "\"<a>\" tag has an unexpected combination of attributes")
1278                );
1279            }
1280        }
1281    };
1282
1283    private static final InlineElementFormatter
1284    ABBR_FORMATTER = new InlineElementFormatter() {
1285
1286        @Override public void
1287        format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException {
1288
1289            output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())));
1290
1291            String title = element.getAttribute("title");
1292            if (!title.isEmpty()) {
1293                output.append(" (\"").append(title).append("\")");
1294            }
1295        }
1296    };
1297
1298    private static final InlineElementFormatter
1299    BR_FORMATTER = new InlineElementFormatter() {
1300
1301        @Override public void
1302        format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException {
1303
1304            if (element.hasChildNodes()) {
1305                html2Txt.htmlErrorHandler.warning(
1306                    new HtmlException(element, "\"<br>\" tag should not have subelements nor contain text")
1307                );
1308            }
1309            output.append('\n');
1310        }
1311    };
1312
1313    private static final InlineElementFormatter
1314    IMG_FORMATTER = new InlineElementFormatter() {
1315
1316        @Override public void
1317        format(Html2Txt html2Txt, Element element, StringBuilder output) {
1318
1319            output.append("[IMG]");
1320        }
1321    };
1322
1323    private static final InlineElementFormatter
1324    INPUT_FORMATTER = new InlineElementFormatter() {
1325
1326        @Override public void
1327        format(Html2Txt html2Txt, Element element, StringBuilder output) {
1328
1329            String type = element.getAttribute("type");
1330            if ("checkbox".equals(type)) {
1331                output.append("checked".equals(element.getAttribute("checked")) ? "[x]" : "[ ]");
1332            } else
1333            if ("hidden".equals(type)) {
1334                ;
1335            } else
1336            if ("password".equals(type)) {
1337                output.append("[******]");
1338            } else
1339            if ("radio".equals(type)) {
1340                output.append("checked".equals(element.getAttribute("checked")) ? "(o)" : "( )");
1341            } else
1342            if ("submit".equals(type)) {
1343                String label = element.getAttribute("value");
1344                if (label.isEmpty()) label = "Submit";
1345                output.append("[ ").append(label).append(" ]");
1346            } else
1347            if ("text".equals(type) || "".equals(type)) {
1348                output.append('[').append(element.getAttribute("value")).append(']');
1349            } else
1350            {
1351                output.append('[').append(type.toUpperCase()).append("-INPUT]");
1352            }
1353        }
1354    };
1355
1356    private static final InlineElementFormatter
1357    Q_FORMATTER = new InlineElementFormatter() {
1358
1359        @Override public void
1360        format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException {
1361
1362            final String cite = element.getAttribute("cite");
1363
1364            output.append('"');
1365            output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())));
1366            output.append("\"");
1367            if (!cite.isEmpty()) output.append(" (").append(cite).append(')');
1368        }
1369    };
1370
1371    /**
1372     * Simply appends the element's formatted content, a.k.a. "ignoring a tag".
1373     */
1374    private static final InlineElementFormatter
1375    IGNORE_INLINE_ELEMENT_FORMATTER = new SimpleInlineElementFormatter("", "");
1376
1377    /**
1378     * Concatenates the <var>prefix</var>, the element's formatted content, and the <var>suffix</var>.
1379     */
1380    static
1381    class SimpleInlineElementFormatter implements InlineElementFormatter {
1382
1383        private final String prefix, suffix;
1384
1385        /**
1386         * Formats enclosed text by prepending the <var>prefix</var> and appending the <var>suffix</var> to it.
1387         */
1388        public
1389        SimpleInlineElementFormatter(String prefix, String suffix) {
1390            this.prefix = prefix;
1391            this.suffix = suffix;
1392        }
1393
1394        @Override public void
1395        format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException {
1396
1397            output.append(this.prefix);
1398            output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())));
1399            output.append(this.suffix);
1400        }
1401    }
1402
1403    private static final InlineElementFormatter
1404    NYI_INLINE_ELEMENT_FORMATTER = new InlineElementFormatter() {
1405
1406        @Override public void
1407        format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException {
1408
1409            html2Txt.htmlErrorHandler.warning(
1410                new HtmlException(
1411                    element,
1412                    "HTML inline element \"<" + element.getNodeName() + ">\" is not yet implemented and thus ignored"
1413                )
1414            );
1415
1416            output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())));
1417        }
1418    };
1419
1420    /**
1421     * Defines the strategies for formatting HTML inline elements.
1422     * <p>
1423     *   To see the HTML inline elements and how they are formatted, click the word "{@code ALL_INLINE_ELEMENTS}"
1424     *   (right above). The right hand side of the mapping means:
1425     *   <dl>
1426     *     <dt>{@link #NYI_INLINE_ELEMENT_FORMATTER}</dt>
1427     *     <dd>
1428     *       Issues a "Not yet implemented" warning.
1429     *     </dd>
1430     *     <dt>{@link #IGNORE_INLINE_ELEMENT_FORMATTER}</dt>
1431     *     <dd>
1432     *       The element is simply replaced with its content (a.k.a. "the element is ignored").
1433     *     </dd>
1434     *     <dt>{@code new} {@link Html2Txt.SimpleInlineElementFormatter SimpleInlineElementFormatter("foo", "bar")}</dt>
1435     *     <dd>
1436     *       The element is replaced with "{@code foo}", the element content, and "{@code bar}".
1437     *     </dd>
1438     *     <dt>(Other)</dt>
1439     *     <dd>
1440     *       This HTML inline element is formatted specially; see the respective field documentation on this page (e.g.
1441     *       {@link #A_FORMATTER}).
1442     *     </dd>
1443     *   </dl>
1444     * </p>
1445     */
1446    protected static final Map<String, InlineElementFormatter>
1447    ALL_INLINE_ELEMENTS = CollectionUtil.<String, InlineElementFormatter>map(
1448        "a",        Html2Txt.A_FORMATTER,
1449        "abbr",     Html2Txt.ABBR_FORMATTER,
1450        "acronym",  Html2Txt.ABBR_FORMATTER,
1451        "b",        new SimpleInlineElementFormatter("*", "*"),
1452        "bdo",      Html2Txt.NYI_INLINE_ELEMENT_FORMATTER,
1453        "big",      Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1454        "br",       Html2Txt.BR_FORMATTER,
1455        "button",   new SimpleInlineElementFormatter("[ ", " ]"),
1456        "cite",     Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1457        "code",     Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1458        "dfn",      Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1459        "em",       new SimpleInlineElementFormatter("<", ">"),
1460        "i",        new SimpleInlineElementFormatter("<", ">"),
1461        "img",      Html2Txt.IMG_FORMATTER,
1462        "input",    Html2Txt.INPUT_FORMATTER,
1463        "kbd",      new SimpleInlineElementFormatter("[ ", " ]"),
1464        "label",    Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1465        "map",      Html2Txt.NYI_INLINE_ELEMENT_FORMATTER,
1466        "object",   Html2Txt.NYI_INLINE_ELEMENT_FORMATTER,
1467        "q",        Html2Txt.Q_FORMATTER,
1468        "samp",     Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1469        "script",   Html2Txt.NYI_INLINE_ELEMENT_FORMATTER,
1470        "select",   new SimpleInlineElementFormatter("[ ", " ]"),
1471        "small",    Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1472        "span",     Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1473        "strong",   new SimpleInlineElementFormatter("*", "*"),
1474        "sub",      Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1475        "sup",      new SimpleInlineElementFormatter("^", ""),
1476        "textarea", new SimpleInlineElementFormatter("[ ", " ]"),
1477        "tt",       Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER,
1478        "u",        new SimpleInlineElementFormatter("_", "_"),
1479        "var",      new SimpleInlineElementFormatter("<", ">")
1480    );
1481
1482    /**
1483     * Wraps the given <var>delegate</var> such that it right-pads the products with <var>c</var> to the given
1484     * <var>width</var>.
1485     */
1486    public static Producer<? extends String>
1487    rightPad(final Producer<? extends CharSequence> delegate, final int width, final char c) {
1488
1489        return new Producer<String>() {
1490
1491            @Override @Nullable public String
1492            produce() {
1493                CharSequence cs = delegate.produce();
1494                if (cs == null) return null;
1495                return (
1496                    cs.length() < width
1497                    ? cs + StringUtil.repeat(width - cs.length(), c)
1498                    : cs.toString()
1499                );
1500            }
1501        };
1502    }
1503
1504    /**
1505     * Creates and returns a {@link Consumer} that forwards its subjects to the <var>delegate</var>, with trailing
1506     * spaces ({@code ' '}) removed.
1507     */
1508    public static Consumer<CharSequence>
1509    rightTrim(final Consumer<? super String> delegate) {
1510
1511        return new Consumer<CharSequence>() {
1512
1513            @Override public void
1514            consume(CharSequence subject) {
1515
1516                int len = subject.length();
1517
1518                if (len == 0 || subject.charAt(len - 1) != ' ') {
1519                    delegate.consume(subject.toString());
1520                } else {
1521
1522                    for (len -= 2; len >= 0 && subject.charAt(len) == ' '; len--);
1523
1524                    delegate.consume(subject.toString().substring(0, len + 1));
1525                }
1526            }
1527        };
1528    }
1529}