Massively improve whitespace handling

DeDiamondPro · Aug 11, 2024 · 43f8a25 · 43f8a25
1 parent 8f90383
commit 43f8a25
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 105 deletions.
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -21,7 +21,7 @@ plugins {
 }
 
 group = "dev.dediamondpro"
-version = "1.2.2"
+version = "1.2.3"
 
 repositories {
     mavenCentral()

diff --git a/src/main/java/dev/dediamondpro/minemark/MineMarkCore.java b/src/main/java/dev/dediamondpro/minemark/MineMarkCore.java
@@ -22,7 +22,7 @@
 import dev.dediamondpro.minemark.elements.creators.TextElementCreator;
 import dev.dediamondpro.minemark.elements.formatting.FormattingElement;
 import dev.dediamondpro.minemark.style.Style;
-import dev.dediamondpro.minemark.utils.PrefixedReader;
+import dev.dediamondpro.minemark.utils.HtmlWhiteSpaceUtil;
 import org.commonmark.Extension;
 import org.commonmark.node.Node;
 import org.commonmark.parser.Parser;
@@ -38,7 +38,6 @@
 import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.concurrent.locks.ReentrantLock;
-import java.util.regex.Pattern;
 
 /**
  * Class responsible for integrating parsing, layout and rendering
@@ -47,7 +46,6 @@
  * @param <R> The class passed to the rendering implementation at render time
  */
 public class MineMarkCore<S extends Style, R> {
-    private static final Pattern ACTIVATION_PATTERN = Pattern.compile("<minemark-activator>.*?</minemark-activator>", Pattern.DOTALL);
     private final Parser markdownParser;
     private final HtmlRenderer htmlRenderer;
     private final MineMarkHtmlParser<S, R> htmlParser;
@@ -69,8 +67,8 @@ protected MineMarkCore(TextElementCreator<S, R> textElement, List<ElementCreator
         }
         this.htmlRenderer = htmlRendererBuilder.build();
         this.htmlParser = new MineMarkHtmlParser<>(textElement, elements, formattingElements);
-        xmlParser = new org.ccil.cowan.tagsoup.Parser();
-        xmlParser.setContentHandler(htmlParser);
+        this.xmlParser = new org.ccil.cowan.tagsoup.Parser();
+        this.xmlParser.setContentHandler(htmlParser);
     }
 
     /**
@@ -84,9 +82,6 @@ protected MineMarkCore(TextElementCreator<S, R> textElement, List<ElementCreator
      * @throws IOException  An IOException during parsing
      */
     public MineMarkElement<S, R> parse(@NotNull S style, @NotNull String markdown, @NotNull Charset charSet) throws SAXException, IOException {
-        // Trick the markdown renderer to activate early,
-        // this makes it so some problematic whitespaces are handled for us
-        markdown = "<minemark-activator>\n\n**MineMark-activation**\n\n</minemark-activator>" + markdown;
         Node document = markdownParser.parse(markdown);
         return parseDocument(style, document, charSet);
     }
@@ -115,9 +110,6 @@ public MineMarkElement<S, R> parse(@NotNull S style, @NotNull String markdown) t
      * @throws IOException  An IOException during parsing
      */
     public MineMarkElement<S, R> parse(@NotNull S style, @NotNull Reader markdown, @NotNull Charset charSet) throws SAXException, IOException {
-        // Trick the markdown renderer to activate early,
-        // this makes it so some problematic whitespaces are handled for us
-        markdown = new PrefixedReader("<minemark-activator>\n\n**MineMark-activation**\n\n</minemark-activator>", markdown);
         Node document = markdownParser.parseReader(markdown);
         return parseDocument(style, document, charSet);
     }
@@ -138,12 +130,12 @@ public MineMarkElement<S, R> parse(@NotNull S style, @NotNull Reader markdown) t
     private MineMarkElement<S, R> parseDocument(@NotNull S style, Node document, @NotNull Charset charSet) throws SAXException, IOException {
         // Render the document to HTML
         String html = htmlRenderer.render(document);
-        // Remove the markdown activation part
-        html = ACTIVATION_PATTERN.matcher(html).replaceFirst("");
         // Get the wrapper to wrap the content with, make sure the html does not include it
         String wrapper = getMineMarkWrapper(html);
         // Prepare the HTML for parsing
         html = "<" + wrapper + ">" + html + "</" + wrapper + ">";
+        // Remove unnecessary whitespaces from the html
+        html = HtmlWhiteSpaceUtil.INSTANCE.removeUnnecessaryWhiteSpace(html);
         // Acquire the lock to make sure this thread is the only one using the parser
         parsingLock.lock();
         try (InputStream stream = new ByteArrayInputStream(html.getBytes(charSet))) {

diff --git a/src/main/java/dev/dediamondpro/minemark/MineMarkHtmlParser.java b/src/main/java/dev/dediamondpro/minemark/MineMarkHtmlParser.java
@@ -107,31 +107,26 @@ public void endElement(String uri, String localName, String qName) {
 
     @Override
     public void characters(char[] ch, int start, int length) {
-        // All newlines are ignored unless this element is preformatted
-        int newLength = length;
-        if (isPreFormatted) {
-            if (ch[start] == '\n') {
-                newLength--;
-            }
-            if (ch[start + length - 1] == '\n') {
-                newLength--;
-            }
-        } else {
+        // Remove leading and trailing newlines
+        if (length > 0 && ch[start] == '\n') {
+            start++;
+            length--;
+        }
+        if (length > 0 && ch[start + length - 1] == '\n') {
+            length--;
+        }
+        // Replace all other newlines with a space if the current text isn't preformatted
+        if (!isPreFormatted) {
             for (int i = start; i < start + length; i++) {
                 if (ch[i] == '\n') {
-                    newLength--;
+                    ch[i] = ' ';
                 }
             }
         }
 
-        char[] modifiedCh = new char[newLength];
-        int index = 0;
-        for (int i = start; i < start + length; i++) {
-            if (ch[i] != '\n' || (isPreFormatted && i != start && i != start + length - 1)) {
-                modifiedCh[index++] = ch[i];
-            }
+        if (length > 0) {
+            textBuilder.append(ch, start, length);
         }
-        textBuilder.append(modifiedCh);
     }
 
     private void addText() {

diff --git a/src/main/java/dev/dediamondpro/minemark/elements/impl/TextElement.java b/src/main/java/dev/dediamondpro/minemark/elements/impl/TextElement.java
@@ -31,8 +31,10 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 public abstract class TextElement<S extends Style, R> extends Element<S, R> implements Inline {
+    protected final Pattern LEADING_WHITESPACE = Pattern.compile("^ +");
     protected final HashMap<LayoutData.MarkDownElementPosition, String> lines = new HashMap<>();
     protected final String text;
     protected float baseLineHeight;
@@ -48,18 +50,10 @@ public TextElement(@NotNull String text, @NotNull S style, @NotNull LayoutStyle
     public void generateLayout(LayoutData layoutData, R renderData) {
         lines.clear();
         ArrayList<String> allLines = new ArrayList<>();
-        String actualText = text;
-        if (!getLayoutStyle().isPreFormatted()) {
-            actualText = actualText.replaceAll(" +", " ");
-        }
-        String[] predefinedLines = actualText.split("\n", -1);
+        String[] predefinedLines = text.split("\n", -1);
         for (int i = 0; i < predefinedLines.length; i++) {
             String line = predefinedLines[i].replace("\n", "");
-            if (layoutStyle.isPreFormatted()) {
-                allLines.add(line);
-            } else {
-                allLines.addAll(wrapText(line, i == 0 ? layoutData.getX() : 0f, layoutData.getMaxWidth(), renderData));
-            }
+            allLines.addAll(wrapText(line, i == 0 ? layoutData.getX() : 0f, layoutData.getMaxWidth(), renderData));
         }
         float codeBlockPadding = layoutStyle.isPartOfCodeBlock() ? style.getCodeBlockStyle().getInlinePaddingTopBottom() : 0f;
         float padding = Math.max(style.getTextStyle().getPadding(), codeBlockPadding);
@@ -115,16 +109,23 @@ protected List<String> wrapText(String text, float startX, float maxWidth, R ren
 
         String[] words = text.split("(?= )");
         float actualMaxWidth = maxWidth - startX;
+        boolean firstOfLine = actualMaxWidth == maxWidth;
         for (String word : words) {
             word = word.replace('\u00A0', ' ');
+            // If this is the first word on the line, replace all leading whitespace chars (unless this is preformatted)
+            if (firstOfLine && !layoutStyle.isPreFormatted()) {
+                word = LEADING_WHITESPACE.matcher(word).replaceAll("");
+                firstOfLine = false;
+            }
             if (getAdjustedTextWidth(currentLine + word, fontSize, renderData) <= actualMaxWidth) {
                 currentLine.append(word);
             } else {
                 String finishedText = currentLine.toString();
                 if (!finishedText.isEmpty() || actualMaxWidth != maxWidth) {
                     lines.add(finishedText);
                 }
-                String cleanedWord = word.replaceAll("^ ", "");
+                // This is the first word after wrapping, replace all leading whitespace chars
+                String cleanedWord = LEADING_WHITESPACE.matcher(word).replaceAll("");
                 currentLine = new StringBuilder();
                 actualMaxWidth = maxWidth;
                 if (getAdjustedTextWidth(cleanedWord, fontSize, renderData) > actualMaxWidth) {

diff --git a/src/main/java/dev/dediamondpro/minemark/utils/HtmlWhiteSpaceUtil.java b/src/main/java/dev/dediamondpro/minemark/utils/HtmlWhiteSpaceUtil.java
@@ -0,0 +1,101 @@
+/*
+ * This file is part of MineMark
+ * Copyright (C) 2024 DeDiamondPro
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License Version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+package dev.dediamondpro.minemark.utils;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Strips whitespace from an HTML string that would not be visible when the HTML is rendered,
+ * loosely following the whitespace processing steps described by MDN. The content of
+ * {@code <pre>} elements is left exactly as it was.
+ */
+public class HtmlWhiteSpaceUtil {
+    // An innermost <pre> element: its body may not contain another opening or closing pre tag
+    private static final Pattern PREFORMATTED_ELEMENT = Pattern.compile("<pre>(?:(?!</?pre>).)*?</pre>", Pattern.DOTALL);
+    // Whitespace at the end or at the start of any line
+    private static final Pattern BEFORE_AFTER_LINEBREAK = Pattern.compile("\\s+$|^\\s+", Pattern.MULTILINE);
+    // Only spaces and tabs are collapsible. "\h" must not be used here: it also matches U+00A0
+    // (non-breaking space) and other Unicode spaces, which have to survive untouched.
+    private static final Pattern COLLAPSIBLE_WHITESPACE = Pattern.compile("[ \\t]+");
+    private static final Pattern SPACES_ACROSS_TAGS = Pattern.compile(" +((</?[^<>]+>)+) +");
+    private static final Pattern SPACES_START = Pattern.compile("^((</?[^<>]+>)+) +");
+    private static final Pattern SPACES_END = Pattern.compile(" +((</?[^<>]+>)+)$");
+    private static final String PLACEHOLDER_PREFIX = "%%%preformattedElement-";
+    private static final String PLACEHOLDER_SUFFIX = "%%%%";
+
+    public static final HtmlWhiteSpaceUtil INSTANCE = new HtmlWhiteSpaceUtil();
+
+    private HtmlWhiteSpaceUtil() {
+    }
+
+    /**
+     * Removes the whitespace from the given HTML that does not contribute to the rendered output.
+     * <p>
+     * Leading and trailing whitespace of the string and of every line is dropped, runs of spaces
+     * and tabs are collapsed into a single space, and spaces on both sides of a run of tags are
+     * reduced to one space in front of the tags. Non-breaking spaces are never touched, and
+     * {@code <pre>} elements (including nested ones) are restored unchanged.
+     *
+     * @param html The HTML to clean up, must not be null
+     * @return The HTML without the unnecessary whitespace
+     */
+    public String removeUnnecessaryWhiteSpace(String html) {
+        // Based on the steps outlined here: https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace
+        // Start by removing leading and trailing whitespaces
+        html = html.trim();
+
+        // Swap every preformatted element for a placeholder so its formatting is not changed.
+        // Insertion order matters: an outer element is always stored after the inner ones.
+        Map<String, String> preformattedElements = new LinkedHashMap<>();
+        int num = 0;
+        Matcher matcher = PREFORMATTED_ELEMENT.matcher(html);
+        while (matcher.find()) {
+            // Find a key to replace the element with, this can absolutely not already be in the string
+            String key = PLACEHOLDER_PREFIX + num++ + PLACEHOLDER_SUFFIX;
+            while (html.contains(key)) {
+                key = PLACEHOLDER_PREFIX + num++ + PLACEHOLDER_SUFFIX;
+            }
+            // Store the element so it can be re-added later
+            String element = matcher.group();
+            preformattedElements.put(key, element);
+            html = html.replace(element, key);
+            // Rescan the updated html: replacing an inner element can expose its enclosing
+            // element as a new innermost match
+            matcher = PREFORMATTED_ELEMENT.matcher(html);
+        }
+
+        // Ignore whitespace before and after line breaks
+        html = BEFORE_AFTER_LINEBREAK.matcher(html).replaceAll("");
+        // Treat tabs as spaces and collapse every run of them into a single space
+        html = COLLAPSIBLE_WHITESPACE.matcher(html).replaceAll(" ");
+        // Remove spaces across tags
+        html = SPACES_ACROSS_TAGS.matcher(html).replaceAll(" $1");
+        // Remove spaces at the start and end of the string across the first and last tag
+        html = SPACES_START.matcher(html).replaceAll("$1");
+        html = SPACES_END.matcher(html).replaceAll("$1");
+
+        // Add preformatted elements back in, in reverse order to handle nested elements
+        List<String> keys = new ArrayList<>(preformattedElements.keySet());
+        Collections.reverse(keys);
+        for (String key : keys) {
+            html = html.replace(key, preformattedElements.get(key));
+        }
+
+        return html;
+    }
+}
diff --git a/src/main/java/dev/dediamondpro/minemark/utils/PrefixedReader.java b/src/main/java/dev/dediamondpro/minemark/utils/PrefixedReader.java