001 002/* 003 * html2txt - Converts HTML documents to plain text 004 * 005 * Copyright (c) 2015, Arno Unkrig 006 * All rights reserved. 007 * 008 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the 009 * following conditions are met: 010 * 011 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the 012 * following disclaimer. 013 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the 014 * following disclaimer in the documentation and/or other materials provided with the distribution. 015 * 3. The name of the author may not be used to endorse or promote products derived from this software without 016 * specific prior written permission. 017 * 018 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 019 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 020 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 021 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 022 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 023 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 024 * POSSIBILITY OF SUCH DAMAGE. 025 */ 026 027package de.unkrig.html2txt; 028 029import java.io.File; 030import java.io.PrintWriter; 031import java.io.Reader; 032import java.io.Writer; 033import java.nio.charset.Charset; 034import java.util.ArrayList; 035import java.util.Collections; 036import java.util.List; 037import java.util.Map; 038 039import javax.xml.parsers.DocumentBuilder; 040import javax.xml.parsers.DocumentBuilderFactory; 041import javax.xml.parsers.ParserConfigurationException; 042import javax.xml.transform.TransformerException; 043 044import org.w3c.dom.Attr; 045import org.w3c.dom.Document; 046import org.w3c.dom.Element; 047import org.w3c.dom.Node; 048import org.xml.sax.ErrorHandler; 049import org.xml.sax.InputSource; 050import org.xml.sax.Locator; 051import org.xml.sax.SAXException; 052import org.xml.sax.SAXParseException; 053 054import de.unkrig.commons.io.IoUtil; 055import de.unkrig.commons.io.LineUtil; 056import de.unkrig.commons.lang.AssertionUtil; 057import de.unkrig.commons.lang.StringUtil; 058import de.unkrig.commons.lang.protocol.Consumer; 059import de.unkrig.commons.lang.protocol.ConsumerUtil; 060import de.unkrig.commons.lang.protocol.ConsumerWhichThrows; 061import de.unkrig.commons.lang.protocol.Producer; 062import de.unkrig.commons.nullanalysis.Nullable; 063import de.unkrig.commons.text.xml.XmlUtil; 064import de.unkrig.commons.util.collections.CollectionUtil; 065 066/** 067 * A converter that turns an HTML document into plain text, using spaces and various punctuation characters to format 068 * it. 069 * <p> 070 * One important restriction is that the HTML document must be "well-formed", i.e. all opening tags must be 071 * <i>exactly</i> matched by closing tags, i.e.: 072 * </p> 073 * <pre> 074 * Let's <i>emphasize</i>. 075 * <ul> 076 * <li>List items</li> 077 * <li>must be terminated with "<tt>&lt;/li></tt>". 078 * </ul> 079 * <br /> 080 * <hr /> 081 * </pre> 082 */ 083public 084class Html2Txt { 085 086 static { AssertionUtil.enableAssertionsForThisClass(); } 087 088 /** All methods of this {@link ErrorHandler} throw the {@link SAXException} they recieve. */ 089 @SuppressWarnings("null") 090 public static final ErrorHandler 091 SIMPLE_SAX_ERROR_HANDLER = new ErrorHandler() { 092 @Override public void warning(@Nullable SAXParseException e) throws SAXParseException { throw e; } 093 @Override public void fatalError(@Nullable SAXParseException e) throws SAXParseException { throw e; } 094 @Override public void error(@Nullable SAXParseException e) throws SAXParseException { throw e; } 095 }; 096 097 /** All methods of theis {@link HtmlErrorHandler} throw the {@link HtmlException} they recieve. */ 098 public static final HtmlErrorHandler 099 SIMPLE_HTML_ERROR_HANDLER = new HtmlErrorHandler() { 100 @Override public void warning(HtmlException e) throws HtmlException { throw e; } 101 @Override public void fatalError(HtmlException e) throws HtmlException { throw e; } 102 @Override public void error(HtmlException e) throws HtmlException { throw e; } 103 }; 104 105 /** 106 * The handler for any HTML-related warnings, errors and fatal errors that may occur during conversion. 107 */ 108 HtmlErrorHandler htmlErrorHandler = Html2Txt.SIMPLE_HTML_ERROR_HANDLER; 109 110 private int pageLeftMarginWidth /*= 0*/; 111 private int pageRightMarginWidth = 1; 112 private Charset inputCharset = Charset.defaultCharset(); 113 private Charset outputCharset = Charset.defaultCharset(); 114 private int pageWidth; 115 116 { 117 try { 118 this.pageWidth = Integer.parseInt(System.getenv("COLUMNS")); 119 } catch (Exception e) { 120 this.pageWidth = 80; 121 } 122 } 123 124 /** 125 * Representation of an exceptional condition that occurred during HTML processing. This exception is always 126 * related to a node in the HTML DOM. 127 */ 128 public static 129 class HtmlException extends Exception { 130 131 private static final long serialVersionUID = 1L; 132 133 private final Node node; 134 135 public 136 HtmlException(Node node, String message) { 137 super(message); 138 this.node = node; 139 } 140 141 @Override public String 142 toString() { 143 144 String s = this.getClass().getName(); 145 146 { 147 Locator l = XmlUtil.getLocation(this.node); 148 if (l != null) { 149 String publicId = l.getPublicId(); 150 if (publicId != null) s += ", " + publicId; 151 s += ", line " + l.getLineNumber() + ", column " + l.getColumnNumber(); 152 } 153 } 154 155 { 156 String message = this.getLocalizedMessage(); 157 if (message != null) s += ": " + message; 158 } 159 160 return s; 161 } 162 } 163 164 /** Handles {@link HtmlException}s. */ 165 public 166 interface HtmlErrorHandler { 167 // SUPPRESS CHECKSTYLE JavadocMethod:3 168 void warning(HtmlException e) throws HtmlException; 169 void fatalError(HtmlException e) throws HtmlException; 170 void error(HtmlException e) throws HtmlException; 171 } 172 173 /** 174 * Formats an HTML block element. 175 * 176 * @see Html2Txt#ALL_BLOCK_ELEMENTS 177 */ 178 public 179 interface BlockElementFormatter { 180 181 /** 182 * Appends lines to the <var>output</var>. The first <var>leftMarginWidth</var> characters of each produced 183 * line are spaces (except for the first line, where the string produced by {@link Html2Txt.Bulleting#next()} 184 * is placed in the left margin), followed by up to <var>measure</var> characters. 185 */ 186 void 187 format( 188 Html2Txt html2Txt, 189 int leftMarginWidth, 190 Bulleting bulleting, 191 int measure, 192 Element element, 193 Consumer<? super CharSequence> output 194 ) throws HtmlException; 195 } 196 197 /** 198 * Formats an HTML inline element. 199 * 200 * @see Html2Txt#ALL_INLINE_ELEMENTS 201 */ 202 public 203 interface InlineElementFormatter { 204 205 /** 206 * Appends characters to the <var>output</var>; "{@code \n}" represents a "break" ("{@code <br />}"). 207 */ 208 void format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException; 209 } 210 211 interface Bulleting { 212 213 /** 214 * @return The text for the "next" bullet, e.g. "7.", "G.", "vii." 215 */ 216 String next(); 217 218 /** 219 * {@link #next()} always returns the empty string. 220 */ 221 Bulleting NONE = new Bulleting() { @Override public String next() { return ""; } }; 222 } 223 224 enum NumberingType { 225 226 /** 227 * <dl> 228 * <dt>0</dt><dd>({@code NumberFormatException})</dd> 229 * <dt>1</dt><dd>"{@code a}"</dd> 230 * <dt>2</dt><dd>"{@code b}"</dd> 231 * <dt>26</dt><dd>"{@code z}"</dd> 232 * <dt>27</dt><dd>"{@code aa}"</dd> 233 * <dt>28</dt><dd>"{@code ab}"</dd> 234 * <dt>702</dt><dd>"{@code zz}"</dd> 235 * <dt>703</dt><dd>"{@code aaa}"</dd> 236 * </dl> 237 * Etc. 238 */ 239 LOWERCASE_LETTERS { 240 241 @Override public long 242 parse(String s) { 243 long result = 0; 244 for (int i = 0; i < s.length(); i++) { 245 char c = s.charAt(i); 246 if (c >= 'A' && c <= 'Z') { 247 result = 26 * result + (c - 'A') + 1; 248 } else 249 if (c >= 'a' && c <= 'z') { 250 result = 26 * result + (c - 'a') + 1; 251 } else 252 { 253 throw new NumberFormatException(); 254 } 255 } 256 return result; 257 } 258 259 @Override public String 260 toString(long value) { 261 262 if (value < 0) return '-' + this.toString(-value); 263 if (value == 0) throw new NumberFormatException(); 264 if (value <= 26) return String.valueOf((char) (value + 'a' - 1)); 265 return this.toString(value / 26) + ((char) ((value % 26) + 'a' - 1)); 266 } 267 }, 268 269 /** 270 * <dl> 271 * <dt>0</dt><dd>({@code NumberFormatException})</dd> 272 * <dt>1</dt><dd>"{@code A}"</dd> 273 * <dt>2</dt><dd>"{@code B}"</dd> 274 * <dt>26</dt><dd>"{@code Z}"</dd> 275 * <dt>27</dt><dd>"{@code AA}"</dd> 276 * <dt>28</dt><dd>"{@code AB}"</dd> 277 * <dt>702</dt><dd>"{@code ZZ}"</dd> 278 * <dt>703</dt><dd>"{@code AAA}"</dd> 279 * </dl> 280 * Etc. 281 */ 282 UPPERCASE_LETTERS { 283 284 @Override public long parse(String s) { return LOWERCASE_LETTERS.parse(s); } 285 286 @Override public String 287 toString(long value) { 288 289 if (value < 0) return '-' + this.toString(-value); 290 if (value == 0) throw new NumberFormatException(); 291 if (value <= 26) return String.valueOf((char) (value + 'A' - 1)); 292 return this.toString(value / 26) + ((char) ((value % 26) + 'A' - 1)); 293 } 294 }, 295 296 /** 297 * <dl> 298 * <dt>0</dt><dd>({@code NumberFormatException})</dd> 299 * <dt>1</dt><dd>"{@code i}"</dd> 300 * <dt>2</dt><dd>"{@code ii}"</dd> 301 * <dt>3</dt><dd>"{@code iii}"</dd> 302 * <dt>4</dt><dd>"{@code iv}"</dd> 303 * <dt>9999</dt><dd>"{@code mmmmmmmmmcmlcix}"</dd> 304 * <dt>10000</dt><dd>({@code NumberFormatException})</dd> 305 * </dl> 306 * Etc. 307 */ 308 LOWERCASE_ROMAN_NUMERALS { 309 310 private final String[][] ds = { 311 " i ii iii iv v vi vii viii ix".split(" "), 312 " x xx xxx xl l lx lxx lxxx lc".split(" "), 313 " c cc ccc cd d dc dcc dccc cm".split(" "), 314 " m mm mmm mmmm mmmmm mmmmmm mmmmmmm mmmmmmmm mmmmmmmmm".split(" "), 315 }; 316 317 @Override public long 318 parse(String s) { 319 if (s.isEmpty()) throw new NumberFormatException(); 320 s = s.toLowerCase(); 321 322 long result = 0; 323 for (int i = 3; i >= 0; i--) { 324 for (int j = 9;; j--) { 325 String d = this.ds[i][j]; 326 if (s.startsWith(d)) { 327 result = 10 * result + j; 328 break; 329 } 330 } 331 } 332 return result; 333 } 334 335 @Override public String 336 toString(long value) { 337 if (value == 0) throw new NumberFormatException(); 338 if (value < 0) return '-' + this.toString(-value); 339 if (value >= 10000) throw new NumberFormatException(); 340 341 if (value <= 9) return this.ds[0][(int) value]; 342 StringBuilder sb = new StringBuilder(); 343 if (value >= 1000) { 344 sb.append(this.ds[3][(int) value / 1000]); 345 value %= 1000; 346 } 347 if (value >= 100) { 348 sb.append(this.ds[2][(int) value / 100]); 349 value %= 100; 350 } 351 if (value >= 10) { 352 sb.append(this.ds[1][(int) value / 10]); 353 value %= 10; 354 } 355 if (value >= 1) { 356 sb.append(this.ds[0][(int) value]); 357 } 358 return sb.toString(); 359 } 360 }, 361 362 /** 363 * <dl> 364 * <dt>0</dt><dd>({@code NumberFormatException})</dd> 365 * <dt>1</dt><dd>"{@code I}"</dd> 366 * <dt>2</dt><dd>"{@code II}"</dd> 367 * <dt>3</dt><dd>"{@code III}"</dd> 368 * <dt>4</dt><dd>"{@code IV}"</dd> 369 * <dt>9999</dt><dd>"{@code MMMMMMMMMCMLCIX}"</dd> 370 * <dt>10000</dt><dd>({@code NumberFormatException})</dd> 371 * </dl> 372 * Etc. 373 */ 374 UPPERCASE_ROMAN_LITERALS { 375 @Override public long parse(String s) { return NumberingType.LOWERCASE_ROMAN_NUMERALS.parse(s); } 376 @Override public String toString(long value) { return LOWERCASE_ROMAN_NUMERALS.toString().toUpperCase(); } 377 }, 378 379 /** 380 * @see Long#parseLong(String) 381 * @see Long#toString(long) 382 */ 383 ARABIC_DIGITS { 384 @Override public long parse(String s) { return Long.parseLong(s); } 385 @Override public String toString(long value) { return Long.toString(value); } 386 }; 387 388 /** 389 * Converts the given string to an integral value. 390 */ 391 public abstract long parse(String s); 392 393 /** 394 * Converts the given integral value to a string. Notice that some {@link NumberingType}s do not support the 395 * value zero, or numbers greater than 9999. 396 */ 397 public abstract String toString(long value); 398 } 399 400 /** 401 * Sets a custom {@link HtmlErrorHandler} on this object. The default handler is {@link 402 * #SIMPLE_HTML_ERROR_HANDLER}. 403 */ 404 public Html2Txt 405 setErrorHandler(HtmlErrorHandler htmlErrorHandler) { 406 this.htmlErrorHandler = htmlErrorHandler; 407 return this; 408 } 409 410 /** 411 * The number of spaces that preceeds each line of output; defaults to zero. 412 */ 413 public Html2Txt 414 setPageLeftMarginWidth(int pageLeftMarginWidth) { 415 this.pageLeftMarginWidth = pageLeftMarginWidth; 416 return this; 417 } 418 419 /** 420 * The maximum length of output lines is "<var>pageWidth</var> - <var>rightMarginWidth</var>". 421 * <p> 422 * Defaults to "{@code 1}", to avoid extra line wraps on certain terminals. 423 * </p> 424 * 425 * @see #setPageWidth(int) 426 */ 427 public Html2Txt 428 setPageRightMarginWidth(int pageRightMarginWidth) { 429 this.pageRightMarginWidth = pageRightMarginWidth; 430 return this; 431 } 432 433 /** 434 * Sets the charset to use when reading HTML input files. Defaults to the {@link Charset#defaultCharset() JVM 435 * default charset}. 436 */ 437 public void 438 setInputCharset(Charset cs) { 439 this.inputCharset = cs; 440 } 441 442 /** 443 * Sets the charset to use when writing text input files. Defaults to the {@link Charset#defaultCharset() JVM 444 * default charset}. 445 */ 446 public void 447 setOutputCharset(Charset cs) { 448 this.outputCharset = cs; 449 } 450 451 /** 452 * The maximum length of output lines is "<var>pageWidth</var> - <var>rightMarginWidth</var>". 453 * <p> 454 * Defaults to the value of the environment variable "{@code $COLUMNS}", or, if that is not set, to 80. 455 * </p> 456 * 457 * @see #setPageRightMarginWidth(int) 458 */ 459 public Html2Txt 460 setPageWidth(int pageWidth) { this.pageWidth = pageWidth; return this; } 461 462 /** 463 * Reads, scans and parses the HTML document in the {@code inputFile}, generates a plain text document, and 464 * writes it to the {@code output}. 465 */ 466 public void 467 html2txt(File inputFile, Writer output) 468 throws ParserConfigurationException, SAXException, TransformerException, HtmlException { 469 470 DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); 471 db.setErrorHandler(Html2Txt.SIMPLE_SAX_ERROR_HANDLER); 472 473 Document document = XmlUtil.parse(db, inputFile, this.inputCharset.name()); 474 475 this.html2txt(document, output); 476 } 477 478 /** 479 * Reads, scans and parses the HTML document in the {@code inputFile}, generates a plain text document, and 480 * writes it to the {@code output}. 481 */ 482 public void 483 html2txt(Reader input, Writer output) 484 throws ParserConfigurationException, SAXException, TransformerException, HtmlException { 485 486 DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); 487 db.setErrorHandler(Html2Txt.SIMPLE_SAX_ERROR_HANDLER); 488 489 InputSource inputSource = new InputSource(); 490 inputSource.setCharacterStream(input); 491 492 Document document = XmlUtil.parse(db, inputSource); 493 494 this.html2txt(document, output); 495 } 496 497 /** 498 * Generates a plain text document from the given HTML <var>document</var>, and writes it to the {@code output}. 499 */ 500 public void 501 html2txt(final Document document, Writer output) throws HtmlException { 502 503 document.getDocumentElement().normalize(); 504 505 PrintWriter pw = output instanceof PrintWriter ? (PrintWriter) output : new PrintWriter(output); 506 507 this.html2txt(document, LineUtil.lineConsumer(pw)); 508 } 509 510 /** 511 * Reads, scans and parses the HTML document in the {@code inputFile}, generates a plain text document, and 512 * writes it to the {@code outputFile}. 513 */ 514 public void 515 html2txt(final File inputFile, File outputFile) throws Exception { 516 517 IoUtil.outputFilePrintWriter( 518 outputFile, 519 this.outputCharset, 520 new ConsumerWhichThrows<PrintWriter, Exception>() { 521 522 @Override public void 523 consume(PrintWriter pw) throws Exception { Html2Txt.this.html2txt(inputFile, pw); } 524 } 525 ); 526 } 527 528 private void 529 html2txt(Document document, Consumer<? super CharSequence> output) throws HtmlException { 530 531 // Some block tags render vertical space, which we want to compress. 532 output = ConsumerUtil.<CharSequence>compress(output, StringUtil.IS_BLANK, ""); 533 534 // Some formatters render trailing spaces (esp. the TABLE_FORMATTER), which we also want to suppress. 535 output = Html2Txt.rightTrim(output); 536 537 Element documentElement = document.getDocumentElement(); 538 539 // Iff the document is structured like 540 // 541 // <html> 542 // ... 543 // <body>...</body> 544 // ... 545 // <body>...</body> 546 // ... 547 // </html> 548 // ... 549 // 550 // , then the result is the formatted <body>s. 551 if ("html".equals(documentElement.getNodeName())) { 552 for (Node n : XmlUtil.iterable(documentElement.getChildNodes())) { 553 if (n.getNodeType() == Node.ELEMENT_NODE && "body".equals(n.getNodeName())) { 554 Element bodyElement = (Element) n; 555 this.formatBlocks( 556 this.pageLeftMarginWidth, 557 Bulleting.NONE, 558 Bulleting.NONE, 559 this.pageWidth - this.pageLeftMarginWidth - this.pageRightMarginWidth, 560 XmlUtil.iterable(bodyElement.getChildNodes()), 561 output 562 ); 563 } 564 } 565 566 return; 567 } 568 569 // Otherwise, assume that the document poses an HTML *fragment*, and the top level nodes ar *blocks*. 570 this.formatBlocks( 571 this.pageLeftMarginWidth, 572 Bulleting.NONE, 573 Bulleting.NONE, 574 this.pageWidth - this.pageLeftMarginWidth - this.pageRightMarginWidth, 575 Collections.singletonList(documentElement), 576 output 577 ); 578 } 579 580 /** 581 * Formats a sequence of {@link Node#TEXT_NODE TEXT} nodes and HTML inline or block {@link Node#ELEMENT_NODE 582 * ELEMENT} nodes. 583 */ 584 <N extends Node> void 585 formatBlocks( 586 int leftMarginWidth, 587 Bulleting inlineSubelementsBulleting, 588 Bulleting blockSubelementsBulleting, 589 int measure, 590 Iterable<N> nodes, 591 Consumer<? super CharSequence> output 592 ) throws HtmlException { 593 594 List<Node> inlineNodes = new ArrayList<Node>(); 595 for (Node n : nodes) { 596 if (n.getNodeType() == Node.TEXT_NODE) { 597 inlineNodes.add(n); 598 } else 599 if (Html2Txt.isInlineElement(n)) { 600 inlineNodes.add(n); 601 } else 602 if (Html2Txt.isBlockElement(n)) { 603 if (!inlineNodes.isEmpty()) { 604 this.wordWrap( 605 leftMarginWidth, 606 inlineSubelementsBulleting, 607 measure, 608 this.getBlock(inlineNodes), 609 output 610 ); 611 inlineNodes.clear(); 612 } 613 614 Element e = (Element) n; 615 616 BlockElementFormatter bef = Html2Txt.ALL_BLOCK_ELEMENTS.get(e.getTagName()); 617 if (bef == null) { 618 this.htmlErrorHandler.error( 619 new HtmlException(n, "Unexpected block element \"" + XmlUtil.toString(e) + "\" in block") 620 ); 621 } else { 622 bef.format(this, leftMarginWidth, blockSubelementsBulleting, measure, e, output); 623 } 624 } else 625 { 626 this.htmlErrorHandler.error( 627 new HtmlException(n, "Unexpected node \"" + XmlUtil.toString(n) + "\" in <body>") 628 ); 629 } 630 } 631 632 if (!inlineNodes.isEmpty()) { 633 this.wordWrap( 634 leftMarginWidth, 635 inlineSubelementsBulleting, 636 measure, 637 this.getBlock(inlineNodes), 638 output 639 ); 640 inlineNodes.clear(); 641 } 642 } 643 644 /** 645 * The given <var>text</var> is word-wrapped such that each output line begins with <var>leftMarginWidth</var> 646 * spaces, followed by up to <var>measure</var> characters. If the <var>text</var> contains very long words, then 647 * some of the output lines may be longer than "<var>leftMarginWidth</var> + <var>measure</var>". 648 * <p> 649 * Newline characters ({@code '\n'}) appear as line breaks in the output. 650 * </p> 651 * <p> 652 * The output lines are fed to the <var>lc</var>. 653 * </p> 654 * @param bulleting The string produced by {@link Bulleting#next()} is placed in the left margin of the first 655 * line generated 656 */ 657 private void 658 wordWrap( 659 int leftMarginWidth, 660 Bulleting bulleting, 661 int measure, 662 String text, 663 Consumer<? super CharSequence> output 664 ) throws HtmlException { 665 666 text = text.trim(); 667 if (text.length() == 0) return; 668 669 if (measure < 1) measure = 1; 670 671 // From this point on, the first letter of "text" is always a non-space character. 672 673 for (int nlidx = text.indexOf('\n'); nlidx != -1; nlidx = text.indexOf('\n')) { 674 this.wordWrap(leftMarginWidth, bulleting, measure, text.substring(0, nlidx), output); 675 for (nlidx++; nlidx < text.length() && text.charAt(nlidx) == ' '; nlidx++); 676 if (nlidx == text.length()) return; 677 text = text.substring(nlidx); 678 } 679 680 String continuationLineLeftMargin = StringUtil.repeat(leftMarginWidth, ' '); 681 String leftMargin; 682 { 683 String bullet = bulleting.next(); 684 if (bullet.length() == 0) { 685 leftMargin = continuationLineLeftMargin; 686 } else 687 if (bullet.length() + 1 < leftMarginWidth) { 688 leftMargin = StringUtil.repeat(leftMarginWidth - bullet.length() - 1, ' ') + bullet + ' '; 689 } else 690 { 691 leftMargin = bullet + ' '; 692 } 693 } 694 695 for (;;) { 696 697 if (text.length() <= measure) break; 698 699 // Determine the point to wrap at. 700 int idx1; // Space after the last word to keep in THIS line. 701 int idx2; // First letter of the first word to put on the NEXT line. 702 IDXS: 703 if (text.charAt(measure) == ' ') { 704 for (idx1 = measure; idx1 > 0 && text.charAt(idx1 - 1) == ' '; idx1--); 705 for (idx2 = measure + 1; idx2 < text.length() && text.charAt(idx2) == ' '; idx2++); 706 } else 707 { 708 for (idx2 = measure; idx2 > 0 && text.charAt(idx2 - 1) != ' '; idx2--) { 709 if (text.charAt(idx2 - 1) == '-') { 710 idx1 = idx2; 711 break IDXS; 712 } 713 } 714 if (idx2 == 0) { 715 for (idx1 = measure + 1; idx1 < text.length() && text.charAt(idx1) != ' '; idx1++); 716 if (idx1 == text.length()) break; 717 for (idx2 = idx1 + 1; idx2 < text.length() && text.charAt(idx2) == ' '; idx2++); 718 if (idx2 == text.length()) { 719 text = text.substring(0, idx1); 720 break; 721 } 722 } else { 723 for (idx1 = idx2 - 1; text.charAt(idx1 - 1) == ' '; idx1--); 724 } 725 } 726 727 output.consume(leftMargin + text.substring(0, idx1)); 728 729 text = text.substring(idx2); 730 731 leftMargin = continuationLineLeftMargin; 732 } 733 734 output.consume(leftMargin + text); 735 } 736 737 /** 738 * Formats text and inline elements into one long line, except for "{@code <br />}" tags, which map into 739 * line breaks. 740 */ 741 private String 742 getBlock(Iterable<Node> nodes) throws HtmlException { 743 StringBuilder sb = new StringBuilder(); 744 745 for (Node n : nodes) { 746 short nodeType = n.getNodeType(); 747 748 if (nodeType == Node.TEXT_NODE) { 749 String content = n.getTextContent(); 750 sb.append(content.replaceAll("\\s+", " ")); 751 } else 752 if (nodeType == Node.ELEMENT_NODE) { 753 Element e = (Element) n; 754 755 InlineElementFormatter ief = Html2Txt.ALL_INLINE_ELEMENTS.get(e.getTagName()); 756 if (ief == null) { 757 this.htmlErrorHandler.error( 758 new HtmlException(n, "Unexpected element \"" + XmlUtil.toString(e) + "\" in block") 759 ); 760 } else { 761 ief.format(this, e, sb); 762 } 763 } else 764 { 765 this.htmlErrorHandler.error(new HtmlException(n, "Unexpected node in block")); 766 } 767 } 768 return sb.toString(); 769 } 770 771 /** 772 * "Block-Level" is categorization of HTML elements, as contrasted with "inline" elements. 773 * <p> 774 * Block-level elements may appear only within a {@code <body>} element. 775 * Their most significant characteristic is that they typically are formatted with a line break before and after 776 * the element (thereby creating a stand-alone block of content). That is, they take up the width of their 777 * containers. 778 * </p> 779 * <p> 780 * The distinction of block-level vs. inline elements is used in HTML specifications up to 4.01. In HTML5, this 781 * binary distinction is replaced with a more complex set of content categories. The "block-level" category 782 * roughly corresponds to the category of flow content in HTML5, while "inline" corresponds to phrasing content, 783 * but there are additional categories. 784 * </p> 785 * <p> 786 * There are a couple of key differences between block-level elements and inline elements: 787 * </p> 788 * <dl> 789 * <dt>Formatting</dt> 790 * <dd> 791 * By default, block-level elements begin on new lines. 792 * </dd> 793 * <dt>Content model</dt> 794 * <dd> 795 * Generally, block-level elements may contain inline elements and other block-level elements. Inherent in 796 * this structural distinction is the idea that block elements create "larger" structures than inline elements. 797 * </dd> 798 * </dl> 799 * <p> 800 * Quoted from <a href="https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements">Mozilla Developer 801 * Network, "Block-level Elements"</a>. 802 * </p> 803 * 804 * <p> 805 * See also <a href="http://www.w3schools.com/html/html_blocks.asp">HTML Tutorial, section "HTML Block 806 * Elements"</a>. 807 * </p> 808 * 809 * @return Whether the given {@code node} is one of the "block elements" by the HTML standard 810 */ 811 private static boolean 812 isBlockElement(Node node) { 813 814 if (node.getNodeType() != Node.ELEMENT_NODE) return false; 815 Element e = (Element) node; 816 817 return Html2Txt.ALL_BLOCK_ELEMENTS.containsKey(e.getTagName()); 818 } 819 820 /** 821 * @param tagName E.g. "{@code table}" 822 */ 823 @Nullable static Element 824 isElement(Node node, String tagName) { 825 826 if (node.getNodeType() != Node.ELEMENT_NODE) return null; 827 Element e = (Element) node; 828 829 return tagName.equals(e.getTagName()) ? e : null; 830 } 831 832 private static final BlockElementFormatter 833 HR_FORMATTER = new BlockElementFormatter() { 834 835 @Override public void 836 format( 837 Html2Txt html2Txt, 838 int leftMarginWidth, 839 Bulleting bulleting, 840 int measure, 841 Element element, 842 Consumer<? super CharSequence> output 843 ) { 844 output.consume(StringUtil.repeat(leftMarginWidth, ' ') + StringUtil.repeat(measure, '-')); 845 } 846 }; 847 848 /** 849 * Formatter for the "{@code <ol>}" ("ordered list") HTML block element. 850 */ 851 protected static final BlockElementFormatter 852 OL_FORMATTER = new BlockElementFormatter() { 853 854 @Override public void 855 format( 856 Html2Txt html2Txt, 857 int leftMarginWidth, 858 Bulleting bulleting, 859 int measure, 860 Element element, 861 Consumer<? super CharSequence> output 862 ) throws HtmlException { 863 864 // Determine the OL type. 865 final NumberingType numberingType; 866 { 867 Attr s = element.getAttributeNode("type"); 868 if (s == null) { 869 numberingType = NumberingType.ARABIC_DIGITS; 870 } else { 871 String value = s.getValue(); 872 numberingType = ( 873 "a".equals(value) ? NumberingType.LOWERCASE_LETTERS : 874 "A".equals(value) ? NumberingType.UPPERCASE_LETTERS : 875 "i".equals(value) ? NumberingType.LOWERCASE_ROMAN_NUMERALS : 876 "I".equals(value) ? NumberingType.UPPERCASE_ROMAN_LITERALS : 877 NumberingType.ARABIC_DIGITS 878 ); 879 } 880 } 881 882 // Compute the index to start from. 883 final int start; 884 { 885 int tmp; 886 try { 887 tmp = Integer.parseInt(element.getAttribute("start")); 888 } catch (Exception e) { 889 tmp = 1; 890 } 891 start = tmp; 892 } 893 894 html2Txt.formatBlocks( 895 leftMarginWidth + 5, 896 Bulleting.NONE, // inlineSubelementsBulleting 897 new Bulleting() { // blockSubelementsBulleting 898 int nextValue = start; 899 @Override public String next() { return numberingType.toString(this.nextValue++) + "."; } 900 }, 901 measure - 5, 902 XmlUtil.iterable(element.getChildNodes()), 903 output 904 ); 905 } 906 }; 907 908 private static final BlockElementFormatter 909 LI_FORMATTER = new BlockElementFormatter() { 910 911 @Override public void 912 format( 913 Html2Txt html2Txt, 914 int leftMarginWidth, 915 Bulleting bulleting, 916 int measure, 917 Element element, 918 Consumer<? super CharSequence> output 919 ) throws HtmlException { 920 921 html2Txt.formatBlocks( 922 leftMarginWidth, 923 bulleting, // inlineSubelementsBulleting 924 Bulleting.NONE, // blockSubelementsBulleting 925 measure, 926 XmlUtil.iterable(element.getChildNodes()), 927 output 928 ); 929 } 930 }; 931 932 private static final BlockElementFormatter 933 PRE_FORMATTER = new BlockElementFormatter() { 934 935 @Override public void 936 format( 937 Html2Txt html2Txt, 938 int leftMarginWidth, 939 Bulleting bulleting, 940 int measure, 941 Element element, 942 Consumer<? super CharSequence> output 943 ) throws HtmlException { 944 945 StringBuilder sb = new StringBuilder(); 946 for (Node n : XmlUtil.iterable(element.getChildNodes())) { 947 short nodeType = n.getNodeType(); 948 949 if (nodeType == Node.TEXT_NODE) { 950 sb.append(n.getTextContent()); 951 } else 952 if (nodeType == Node.ELEMENT_NODE) { 953 Element e = (Element) n; 954 955 InlineElementFormatter ief = Html2Txt.ALL_INLINE_ELEMENTS.get(e.getTagName()); 956 if (ief == null) { 957 html2Txt.htmlErrorHandler.error( 958 new HtmlException(n, "Unexpected element \"" + XmlUtil.toString(e) + "\" in <pre>") 959 ); 960 } else { 961 ief.format(html2Txt, e, sb); 962 } 963 } else 964 { 965 html2Txt.htmlErrorHandler.error(new HtmlException(n, "Unexpected node in <pre>")); 966 } 967 } 968 969 Producer<? extends CharSequence> lp = LineUtil.lineProducer(sb); 970 for (boolean first = true;; first = false) { 971 972 CharSequence line = lp.produce(); 973 if (line == null) break; 974 975 // Ignore leading empty lines. 976 if (first && line.length() == 0) continue; 977 978 if (first) { 979 String bullet = bulleting.next(); 980 if (bullet.length() + 1 > leftMarginWidth) { 981 line = bullet + ' ' + line; 982 } else { 983 line = StringUtil.repeat(leftMarginWidth - bullet.length() - 1, ' ') + bullet + ' ' + line; 984 } 985 } 986 987 output.consume(line); 988 } 989 } 990 }; 991 992 /** 993 * Formatter for the "{@code <table>}" HTML block element. 994 */ 995 protected static final BlockElementFormatter 996 TABLE_FORMATTER = new TableFormatter(); 997 998 /** 999 * @return The length of the longest of the <var>css</var>, or {@code 0} iff <var>css</var> is empty 1000 */ 1001 public static int 1002 maxLength(Iterable<? extends CharSequence> css) { 1003 1004 int result = 0; 1005 for (CharSequence cs : css) { 1006 int len = cs.length(); 1007 if (len > result) result = len; 1008 } 1009 1010 return result; 1011 } 1012 1013 private static final BlockElementFormatter 1014 UL_FORMATTER = new BlockElementFormatter() { 1015 1016 @Override public void 1017 format( 1018 Html2Txt html2Txt, 1019 int leftMarginWidth, 1020 Bulleting bulleting, 1021 int measure, 1022 Element element, 1023 Consumer<? super CharSequence> output 1024 ) throws HtmlException { 1025 1026 html2Txt.formatBlocks( 1027 leftMarginWidth + 3, 1028 Bulleting.NONE, 1029 new Bulleting() { @Override public String next() { return "*"; } }, 1030 measure - 3, 1031 XmlUtil.iterable(element.getChildNodes()), 1032 output 1033 ); 1034 } 1035 }; 1036 1037 private static 1038 class HeadingBlockElementFormatter implements BlockElementFormatter { 1039 1040 private boolean emptyLineAbove, emptyLineBelow; 1041 @Nullable private String prefix, suffix; 1042 private int underline = -1; 1043 1044 public 1045 HeadingBlockElementFormatter(String prefix, String suffix) { 1046 this.prefix = prefix; 1047 this.suffix = suffix; 1048 } 1049 1050 public 1051 HeadingBlockElementFormatter(boolean emptyLineAbove, char underline, boolean emptyLineBelow) { 1052 this.emptyLineAbove = emptyLineAbove; 1053 this.underline = underline; 1054 this.emptyLineBelow = emptyLineBelow; 1055 } 1056 1057 @Override public void 1058 format( 1059 Html2Txt html2Txt, 1060 int leftMarginWidth, 1061 Bulleting bulleting, 1062 int measure, 1063 Element element, 1064 Consumer<? super CharSequence> output 1065 ) throws HtmlException { 1066 1067 String text = html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())); 1068 if (this.prefix != null) text = this.prefix.concat(text); 1069 if (this.suffix != null) text = text.concat(this.suffix); 1070 1071 if (this.emptyLineAbove) output.consume(""); 1072 output.consume(text); 1073 if (this.underline != -1) output.consume(StringUtil.repeat(text.length(), (char) this.underline)); 1074 if (this.emptyLineBelow) output.consume(""); 1075 } 1076 } 1077 1078 /** 1079 * Simply appends the element's formatted content, a.k.a. "the tag is ignored". 1080 */ 1081 private static final BlockElementFormatter 1082 IGNORE_BLOCK_ELEMENT_FORMATTER = new IndentingBlockElementFormatter(0); 1083 1084 /** 1085 * Does <i>nothing</i>, i.e. even its contents is ignored. 1086 */ 1087 private static final BlockElementFormatter NOP_BLOCK_ELEMENT_FORMATTER = new BlockElementFormatter() { 1088 1089 @Override public void 1090 format( 1091 Html2Txt html2Txt, 1092 int leftMarginWidth, 1093 Bulleting bulleting, 1094 int measure, 1095 Element element, 1096 Consumer<? super CharSequence> output 1097 ) { 1098 ; 1099 } 1100 }; 1101 1102 private static final BlockElementFormatter 1103 NYI_BLOCK_ELEMENT_FORMATTER = new BlockElementFormatter() { 1104 1105 @Override public void 1106 format( 1107 Html2Txt html2Txt, 1108 int leftMarginWidth, 1109 Bulleting bulleting, 1110 int measure, 1111 Element element, 1112 Consumer<? super CharSequence> output 1113 ) throws HtmlException { 1114 1115 html2Txt.htmlErrorHandler.warning( 1116 new HtmlException( 1117 element, 1118 "HTML block element \"<" + element.getNodeName() + ">\" is not yet implemented and thus ignored" 1119 ) 1120 ); 1121 1122 Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER.format( 1123 html2Txt, 1124 leftMarginWidth, 1125 bulleting, 1126 measure, 1127 element, 1128 output 1129 ); 1130 } 1131 }; 1132 1133 public static 1134 class IndentingBlockElementFormatter implements BlockElementFormatter { 1135 1136 private final int indentation; 1137 1138 public IndentingBlockElementFormatter(int indentation) { this.indentation = indentation; } 1139 1140 @Override public void 1141 format( 1142 Html2Txt html2Txt, 1143 int leftMarginWidth, 1144 Bulleting bulleting, 1145 int measure, 1146 Element element, 1147 Consumer<? super CharSequence> output 1148 ) throws HtmlException { 1149 1150 html2Txt.formatBlocks( 1151 leftMarginWidth + this.indentation, 1152 Bulleting.NONE, 1153 Bulleting.NONE, 1154 measure - this.indentation, 1155 XmlUtil.iterable(element.getChildNodes()), 1156 output 1157 ); 1158 } 1159 } 1160 1161 /** 1162 * Defines the strategies for formatting HTML block elements. 1163 * <p> 1164 * To see the HTML block elements and how they are formatted, click the word "{@code ALL_BLOCK_ELEMENTS}" 1165 * (right above). The right hand side of the mapping means: 1166 * <dl> 1167 * <dt>{@link Html2Txt#NYI_BLOCK_ELEMENT_FORMATTER NYI_BLOCK_ELEMENT_FORMATTER}</dt> 1168 * <dd> 1169 * Issues a "Not yet implemented" warning. 1170 * </dd> 1171 * <dt>{@link Html2Txt#IGNORE_BLOCK_ELEMENT_FORMATTER IGNORE_BLOCK_ELEMENT_FORMATTER}</dt> 1172 * <dd> 1173 * The element is simply replaced with its content (a.k.a. "the element is ignored"). 1174 * </dd> 1175 * <dt>{@code new} {@link IndentingBlockElementFormatter IndentingBlockElementFormatter(<var>N</var>)}</dt> 1176 * <dd> 1177 * The block is formatted <var>N</var> characters indented, relative to the enclosing block. 1178 * </dd> 1179 * <dt>(Other)</dt> 1180 * <dd> 1181 * This HTML block element is formatted specially; see the respective field documentation on this page (e.g. 1182 * {@link #OL_FORMATTER}). 1183 * </dd> 1184 * </dl> 1185 * </p> 1186 */ 1187 protected static final Map<String, BlockElementFormatter> 1188 ALL_BLOCK_ELEMENTS = Collections.unmodifiableMap(CollectionUtil.<String, BlockElementFormatter>map( 1189 "address", new IndentingBlockElementFormatter(2), 1190 "article", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1191 "aside", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1192 "audio", Html2Txt.NYI_BLOCK_ELEMENT_FORMATTER, 1193 "blockquote", new IndentingBlockElementFormatter(2), 1194 "canvas", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1195 "dd", new IndentingBlockElementFormatter(4), 1196 "div", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1197 "dl", new IndentingBlockElementFormatter(2), 1198 "dt", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1199 "fieldset", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1200 "figcaption", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1201 "figure", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1202 "footer", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1203 "form", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1204 "h1", new HeadingBlockElementFormatter(true, '*', true), 1205 "h2", new HeadingBlockElementFormatter(true, '=', true), 1206 "h3", new HeadingBlockElementFormatter(true, '-', true), 1207 "h4", new HeadingBlockElementFormatter("=== ", " ==="), 1208 "h5", new HeadingBlockElementFormatter("== ", " =="), 1209 "h6", new HeadingBlockElementFormatter("= ", " ="), 1210 "header", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1211 "hgroup", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1212 "hr", Html2Txt.HR_FORMATTER, 1213 "li", Html2Txt.LI_FORMATTER, 1214 "main", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1215 "nav", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1216 "noscript", Html2Txt.NOP_BLOCK_ELEMENT_FORMATTER, 1217 "ol", Html2Txt.OL_FORMATTER, 1218 "output", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1219 "p", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1220 "pre", Html2Txt.PRE_FORMATTER, 1221 "section", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1222 "table", Html2Txt.TABLE_FORMATTER, 1223 "tfoot", Html2Txt.IGNORE_BLOCK_ELEMENT_FORMATTER, 1224 "ul", Html2Txt.UL_FORMATTER, 1225 "video", Html2Txt.NYI_BLOCK_ELEMENT_FORMATTER 1226 )); 1227 1228 /** 1229 * HTML (Hypertext Markup Language) elements are usually "inline" elements or "block-level" elements. 1230 * <p> 1231 * An inline element occupies only the space bounded by the tags that define the inline element. 1232 * </p> 1233 * 1234 * <p> 1235 * Quoted from <a href="https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elemente">Mozilla Developer 1236 * Network, "Inline Elements"</a>. 1237 * </p> 1238 * <p> 1239 * See <a href="http://www.w3schools.com/html/html_blocks.asp">HTML Tutorial, section "HTML Block Elements"</a>. 1240 * </p> 1241 */ 1242 private static boolean 1243 isInlineElement(Node node) { 1244 1245 if (node.getNodeType() != Node.ELEMENT_NODE) return false; 1246 Element e = (Element) node; 1247 1248 return Html2Txt.ALL_INLINE_ELEMENTS.containsKey(e.getTagName()); 1249 } 1250 1251 /** 1252 * Formats "{@code <a href="...">...</a>}" and "{@code <a name="..." />}". 1253 */ 1254 private static final InlineElementFormatter 1255 A_FORMATTER = new InlineElementFormatter() { 1256 1257 @Override public void 1258 format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException { 1259 String name = element.getAttribute("name"); 1260 String href = element.getAttribute("href"); 1261 if (!name.isEmpty() && href.isEmpty()) { 1262 if (!html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes())).isEmpty()) { 1263 html2Txt.htmlErrorHandler.warning( 1264 new HtmlException(element, "'<a name=\"...\" />' tag should not have content") 1265 ); 1266 } 1267 1268 // '<a name="..." />' renders as "". 1269 ; 1270 } else 1271 if (!href.isEmpty() && name.isEmpty()) { 1272 output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes()))); 1273 output.append(" (see \"").append(href).append("\")"); 1274 } else 1275 { 1276 html2Txt.htmlErrorHandler.warning( 1277 new HtmlException(element, "\"<a>\" tag has an unexpected combination of attributes") 1278 ); 1279 } 1280 } 1281 }; 1282 1283 private static final InlineElementFormatter 1284 ABBR_FORMATTER = new InlineElementFormatter() { 1285 1286 @Override public void 1287 format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException { 1288 1289 output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes()))); 1290 1291 String title = element.getAttribute("title"); 1292 if (!title.isEmpty()) { 1293 output.append(" (\"").append(title).append("\")"); 1294 } 1295 } 1296 }; 1297 1298 private static final InlineElementFormatter 1299 BR_FORMATTER = new InlineElementFormatter() { 1300 1301 @Override public void 1302 format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException { 1303 1304 if (element.hasChildNodes()) { 1305 html2Txt.htmlErrorHandler.warning( 1306 new HtmlException(element, "\"<br>\" tag should not have subelements nor contain text") 1307 ); 1308 } 1309 output.append('\n'); 1310 } 1311 }; 1312 1313 private static final InlineElementFormatter 1314 IMG_FORMATTER = new InlineElementFormatter() { 1315 1316 @Override public void 1317 format(Html2Txt html2Txt, Element element, StringBuilder output) { 1318 1319 output.append("[IMG]"); 1320 } 1321 }; 1322 1323 private static final InlineElementFormatter 1324 INPUT_FORMATTER = new InlineElementFormatter() { 1325 1326 @Override public void 1327 format(Html2Txt html2Txt, Element element, StringBuilder output) { 1328 1329 String type = element.getAttribute("type"); 1330 if ("checkbox".equals(type)) { 1331 output.append("checked".equals(element.getAttribute("checked")) ? "[x]" : "[ ]"); 1332 } else 1333 if ("hidden".equals(type)) { 1334 ; 1335 } else 1336 if ("password".equals(type)) { 1337 output.append("[******]"); 1338 } else 1339 if ("radio".equals(type)) { 1340 output.append("checked".equals(element.getAttribute("checked")) ? "(o)" : "( )"); 1341 } else 1342 if ("submit".equals(type)) { 1343 String label = element.getAttribute("value"); 1344 if (label.isEmpty()) label = "Submit"; 1345 output.append("[ ").append(label).append(" ]"); 1346 } else 1347 if ("text".equals(type) || "".equals(type)) { 1348 output.append('[').append(element.getAttribute("value")).append(']'); 1349 } else 1350 { 1351 output.append('[').append(type.toUpperCase()).append("-INPUT]"); 1352 } 1353 } 1354 }; 1355 1356 private static final InlineElementFormatter 1357 Q_FORMATTER = new InlineElementFormatter() { 1358 1359 @Override public void 1360 format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException { 1361 1362 final String cite = element.getAttribute("cite"); 1363 1364 output.append('"'); 1365 output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes()))); 1366 output.append("\""); 1367 if (!cite.isEmpty()) output.append(" (").append(cite).append(')'); 1368 } 1369 }; 1370 1371 /** 1372 * Simply appends the element's formatted content, a.k.a. "ignoring a tag". 1373 */ 1374 private static final InlineElementFormatter 1375 IGNORE_INLINE_ELEMENT_FORMATTER = new SimpleInlineElementFormatter("", ""); 1376 1377 /** 1378 * Concatenates the <var>prefix</var>, the element's formatted content, and the <var>suffix</var>. 1379 */ 1380 static 1381 class SimpleInlineElementFormatter implements InlineElementFormatter { 1382 1383 private final String prefix, suffix; 1384 1385 /** 1386 * Formats enclosed text by prepending the <var>prefix</var> and appending the <var>suffix</var> to it. 1387 */ 1388 public 1389 SimpleInlineElementFormatter(String prefix, String suffix) { 1390 this.prefix = prefix; 1391 this.suffix = suffix; 1392 } 1393 1394 @Override public void 1395 format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException { 1396 1397 output.append(this.prefix); 1398 output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes()))); 1399 output.append(this.suffix); 1400 } 1401 } 1402 1403 private static final InlineElementFormatter 1404 NYI_INLINE_ELEMENT_FORMATTER = new InlineElementFormatter() { 1405 1406 @Override public void 1407 format(Html2Txt html2Txt, Element element, StringBuilder output) throws HtmlException { 1408 1409 html2Txt.htmlErrorHandler.warning( 1410 new HtmlException( 1411 element, 1412 "HTML inline element \"<" + element.getNodeName() + ">\" is not yet implemented and thus ignored" 1413 ) 1414 ); 1415 1416 output.append(html2Txt.getBlock(XmlUtil.iterable(element.getChildNodes()))); 1417 } 1418 }; 1419 1420 /** 1421 * Defines the strategies for formatting HTML inline elements. 1422 * <p> 1423 * To see the HTML inline elements and how they are formatted, click the word "{@code ALL_INLINE_ELEMENTS}" 1424 * (right above). The right hand side of the mapping means: 1425 * <dl> 1426 * <dt>{@link #NYI_INLINE_ELEMENT_FORMATTER}</dt> 1427 * <dd> 1428 * Issues a "Not yet implemented" warning. 1429 * </dd> 1430 * <dt>{@link #IGNORE_INLINE_ELEMENT_FORMATTER}</dt> 1431 * <dd> 1432 * The element is simply replaced with its content (a.k.a. "the element is ignored"). 1433 * </dd> 1434 * <dt>{@code new} {@link Html2Txt.SimpleInlineElementFormatter SimpleInlineElementFormatter("foo", "bar")}</dt> 1435 * <dd> 1436 * The element is replaced with "{@code foo}", the element content, and "{@code bar}". 1437 * </dd> 1438 * <dt>(Other)</dt> 1439 * <dd> 1440 * This HTML inline element is formatted specially; see the respective field documentation on this page (e.g. 1441 * {@link #A_FORMATTER}). 1442 * </dd> 1443 * </dl> 1444 * </p> 1445 */ 1446 protected static final Map<String, InlineElementFormatter> 1447 ALL_INLINE_ELEMENTS = CollectionUtil.<String, InlineElementFormatter>map( 1448 "a", Html2Txt.A_FORMATTER, 1449 "abbr", Html2Txt.ABBR_FORMATTER, 1450 "acronym", Html2Txt.ABBR_FORMATTER, 1451 "b", new SimpleInlineElementFormatter("*", "*"), 1452 "bdo", Html2Txt.NYI_INLINE_ELEMENT_FORMATTER, 1453 "big", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1454 "br", Html2Txt.BR_FORMATTER, 1455 "button", new SimpleInlineElementFormatter("[ ", " ]"), 1456 "cite", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1457 "code", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1458 "dfn", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1459 "em", new SimpleInlineElementFormatter("<", ">"), 1460 "i", new SimpleInlineElementFormatter("<", ">"), 1461 "img", Html2Txt.IMG_FORMATTER, 1462 "input", Html2Txt.INPUT_FORMATTER, 1463 "kbd", new SimpleInlineElementFormatter("[ ", " ]"), 1464 "label", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1465 "map", Html2Txt.NYI_INLINE_ELEMENT_FORMATTER, 1466 "object", Html2Txt.NYI_INLINE_ELEMENT_FORMATTER, 1467 "q", Html2Txt.Q_FORMATTER, 1468 "samp", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1469 "script", Html2Txt.NYI_INLINE_ELEMENT_FORMATTER, 1470 "select", new SimpleInlineElementFormatter("[ ", " ]"), 1471 "small", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1472 "span", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1473 "strong", new SimpleInlineElementFormatter("*", "*"), 1474 "sub", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1475 "sup", new SimpleInlineElementFormatter("^", ""), 1476 "textarea", new SimpleInlineElementFormatter("[ ", " ]"), 1477 "tt", Html2Txt.IGNORE_INLINE_ELEMENT_FORMATTER, 1478 "u", new SimpleInlineElementFormatter("_", "_"), 1479 "var", new SimpleInlineElementFormatter("<", ">") 1480 ); 1481 1482 /** 1483 * Wraps the given <var>delegate</var> such that it right-pads the products with <var>c</var> to the given 1484 * <var>width</var>. 1485 */ 1486 public static Producer<? extends String> 1487 rightPad(final Producer<? extends CharSequence> delegate, final int width, final char c) { 1488 1489 return new Producer<String>() { 1490 1491 @Override @Nullable public String 1492 produce() { 1493 CharSequence cs = delegate.produce(); 1494 if (cs == null) return null; 1495 return ( 1496 cs.length() < width 1497 ? cs + StringUtil.repeat(width - cs.length(), c) 1498 : cs.toString() 1499 ); 1500 } 1501 }; 1502 } 1503 1504 /** 1505 * Creates and returns a {@link Consumer} that forwards its subjects to the <var>delegate</var>, with trailing 1506 * spaces ({@code ' '}) removed. 1507 */ 1508 public static Consumer<CharSequence> 1509 rightTrim(final Consumer<? super String> delegate) { 1510 1511 return new Consumer<CharSequence>() { 1512 1513 @Override public void 1514 consume(CharSequence subject) { 1515 1516 int len = subject.length(); 1517 1518 if (len == 0 || subject.charAt(len - 1) != ' ') { 1519 delegate.consume(subject.toString()); 1520 } else { 1521 1522 for (len -= 2; len >= 0 && subject.charAt(len) == ' '; len--); 1523 1524 delegate.consume(subject.toString().substring(0, len + 1)); 1525 } 1526 } 1527 }; 1528 } 1529}