Skip to content

Commit

Permalink
Massively improve whitespace handling
Browse files Browse the repository at this point in the history
  • Loading branch information
DeDiamondPro committed Aug 11, 2024
1 parent 8f90383 commit 43f8a25
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 105 deletions.
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ plugins {
}

group = "dev.dediamondpro"
version = "1.2.2"
version = "1.2.3"

repositories {
mavenCentral()
Expand Down
18 changes: 5 additions & 13 deletions src/main/java/dev/dediamondpro/minemark/MineMarkCore.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import dev.dediamondpro.minemark.elements.creators.TextElementCreator;
import dev.dediamondpro.minemark.elements.formatting.FormattingElement;
import dev.dediamondpro.minemark.style.Style;
import dev.dediamondpro.minemark.utils.PrefixedReader;
import dev.dediamondpro.minemark.utils.HtmlWhiteSpaceUtil;
import org.commonmark.Extension;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
Expand All @@ -38,7 +38,6 @@
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;

/**
* Class responsible for integrating parsing, layout and rendering
Expand All @@ -47,7 +46,6 @@
* @param <R> The class passed to the rendering implementation at render time
*/
public class MineMarkCore<S extends Style, R> {
private static final Pattern ACTIVATION_PATTERN = Pattern.compile("<minemark-activator>.*?</minemark-activator>", Pattern.DOTALL);
private final Parser markdownParser;
private final HtmlRenderer htmlRenderer;
private final MineMarkHtmlParser<S, R> htmlParser;
Expand All @@ -69,8 +67,8 @@ protected MineMarkCore(TextElementCreator<S, R> textElement, List<ElementCreator
}
this.htmlRenderer = htmlRendererBuilder.build();
this.htmlParser = new MineMarkHtmlParser<>(textElement, elements, formattingElements);
xmlParser = new org.ccil.cowan.tagsoup.Parser();
xmlParser.setContentHandler(htmlParser);
this.xmlParser = new org.ccil.cowan.tagsoup.Parser();
this.xmlParser.setContentHandler(htmlParser);
}

/**
Expand All @@ -84,9 +82,6 @@ protected MineMarkCore(TextElementCreator<S, R> textElement, List<ElementCreator
* @throws IOException An IOException during parsing
*/
public MineMarkElement<S, R> parse(@NotNull S style, @NotNull String markdown, @NotNull Charset charSet) throws SAXException, IOException {
// Trick the markdown renderer to activate early,
// this makes it so some problematic whitespaces are handled for us
markdown = "<minemark-activator>\n\n**MineMark-activation**\n\n</minemark-activator>" + markdown;
Node document = markdownParser.parse(markdown);
return parseDocument(style, document, charSet);
}
Expand Down Expand Up @@ -115,9 +110,6 @@ public MineMarkElement<S, R> parse(@NotNull S style, @NotNull String markdown) t
* @throws IOException An IOException during parsing
*/
public MineMarkElement<S, R> parse(@NotNull S style, @NotNull Reader markdown, @NotNull Charset charSet) throws SAXException, IOException {
// Trick the markdown renderer to activate early,
// this makes it so some problematic whitespaces are handled for us
markdown = new PrefixedReader("<minemark-activator>\n\n**MineMark-activation**\n\n</minemark-activator>", markdown);
Node document = markdownParser.parseReader(markdown);
return parseDocument(style, document, charSet);
}
Expand All @@ -138,12 +130,12 @@ public MineMarkElement<S, R> parse(@NotNull S style, @NotNull Reader markdown) t
private MineMarkElement<S, R> parseDocument(@NotNull S style, Node document, @NotNull Charset charSet) throws SAXException, IOException {
// Render the document to HTML
String html = htmlRenderer.render(document);
// Remove the markdown activation part
html = ACTIVATION_PATTERN.matcher(html).replaceFirst("");
// Get the wrapper to wrap the content with, make sure the html does not include it
String wrapper = getMineMarkWrapper(html);
// Prepare the HTML for parsing
html = "<" + wrapper + ">" + html + "</" + wrapper + ">";
// Remove unnecessary whitespaces from the html
html = HtmlWhiteSpaceUtil.INSTANCE.removeUnnecessaryWhiteSpace(html);
// Acquire the lock to make sure this thread is the only one using the parser
parsingLock.lock();
try (InputStream stream = new ByteArrayInputStream(html.getBytes(charSet))) {
Expand Down
31 changes: 13 additions & 18 deletions src/main/java/dev/dediamondpro/minemark/MineMarkHtmlParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,31 +107,26 @@ public void endElement(String uri, String localName, String qName) {

@Override
public void characters(char[] ch, int start, int length) {
// All newlines are ignored unless this element is preformatted
int newLength = length;
if (isPreFormatted) {
if (ch[start] == '\n') {
newLength--;
}
if (ch[start + length - 1] == '\n') {
newLength--;
}
} else {
// Remove leading and trailing newlines
if (length > 0 && ch[start] == '\n') {
start++;
length--;
}
if (length > 0 && ch[start + length - 1] == '\n') {
length--;
}
// Replace all other newlines with a space if the current text isn't preformatted
if (!isPreFormatted) {
for (int i = start; i < start + length; i++) {
if (ch[i] == '\n') {
newLength--;
ch[i] = ' ';
}
}
}

char[] modifiedCh = new char[newLength];
int index = 0;
for (int i = start; i < start + length; i++) {
if (ch[i] != '\n' || (isPreFormatted && i != start && i != start + length - 1)) {
modifiedCh[index++] = ch[i];
}
if (length > 0) {
textBuilder.append(ch, start, length);
}
textBuilder.append(modifiedCh);
}

private void addText() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

public abstract class TextElement<S extends Style, R> extends Element<S, R> implements Inline {
protected final Pattern LEADING_WHITESPACE = Pattern.compile("^ +");
protected final HashMap<LayoutData.MarkDownElementPosition, String> lines = new HashMap<>();
protected final String text;
protected float baseLineHeight;
Expand All @@ -48,18 +50,10 @@ public TextElement(@NotNull String text, @NotNull S style, @NotNull LayoutStyle
public void generateLayout(LayoutData layoutData, R renderData) {
lines.clear();
ArrayList<String> allLines = new ArrayList<>();
String actualText = text;
if (!getLayoutStyle().isPreFormatted()) {
actualText = actualText.replaceAll(" +", " ");
}
String[] predefinedLines = actualText.split("\n", -1);
String[] predefinedLines = text.split("\n", -1);
for (int i = 0; i < predefinedLines.length; i++) {
String line = predefinedLines[i].replace("\n", "");
if (layoutStyle.isPreFormatted()) {
allLines.add(line);
} else {
allLines.addAll(wrapText(line, i == 0 ? layoutData.getX() : 0f, layoutData.getMaxWidth(), renderData));
}
allLines.addAll(wrapText(line, i == 0 ? layoutData.getX() : 0f, layoutData.getMaxWidth(), renderData));
}
float codeBlockPadding = layoutStyle.isPartOfCodeBlock() ? style.getCodeBlockStyle().getInlinePaddingTopBottom() : 0f;
float padding = Math.max(style.getTextStyle().getPadding(), codeBlockPadding);
Expand Down Expand Up @@ -115,16 +109,23 @@ protected List<String> wrapText(String text, float startX, float maxWidth, R ren

String[] words = text.split("(?= )");
float actualMaxWidth = maxWidth - startX;
boolean firstOfLine = actualMaxWidth == maxWidth;
for (String word : words) {
word = word.replace('\u00A0', ' ');
// If this is the first word on the line, replace all leading whitespace chars (unless this is preformatted)
if (firstOfLine && !layoutStyle.isPreFormatted()) {
word = LEADING_WHITESPACE.matcher(word).replaceAll("");
firstOfLine = false;
}
if (getAdjustedTextWidth(currentLine + word, fontSize, renderData) <= actualMaxWidth) {
currentLine.append(word);
} else {
String finishedText = currentLine.toString();
if (!finishedText.isEmpty() || actualMaxWidth != maxWidth) {
lines.add(finishedText);
}
String cleanedWord = word.replaceAll("^ ", "");
// This is the first word after wrapping, replace all leading whitespace chars
String cleanedWord = LEADING_WHITESPACE.matcher(word).replaceAll("");
currentLine = new StringBuilder();
actualMaxWidth = maxWidth;
if (getAdjustedTextWidth(cleanedWord, fontSize, renderData) > actualMaxWidth) {
Expand Down
101 changes: 101 additions & 0 deletions src/main/java/dev/dediamondpro/minemark/utils/HtmlWhiteSpaceUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* This file is part of MineMark
* Copyright (C) 2024 DeDiamondPro
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License Version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package dev.dediamondpro.minemark.utils;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Removes whitespace from an HTML string that would not be rendered, while leaving the
 * contents of preformatted ({@code <pre>}) elements untouched.
 * <p>
 * The steps are based on
 * <a href="https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace">MDN's
 * description of whitespace handling</a>. Line breaks that are not surrounded by other
 * whitespace are kept as-is by this class.
 * <p>
 * The class is stateless; use the shared {@link #INSTANCE}.
 */
public final class HtmlWhiteSpaceUtil {
    // The patterns are declared before INSTANCE so they are initialized first.

    /**
     * An innermost {@code <pre>} element: the opening tag may carry attributes and may be
     * written in any case, and the body may not contain another opening or closing pre tag.
     */
    private static final Pattern PREFORMATTED_ELEMENT = Pattern.compile(
            "<pre(?:\\s[^<>]*)?>(?:(?!</?pre(?:\\s[^<>]*)?>).)*?</pre>",
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE
    );
    private static final Pattern BEFORE_AFTER_LINEBREAK = Pattern.compile("\\s+$|^\\s+", Pattern.MULTILINE);
    /**
     * Horizontal whitespace except the non-breaking space (U+00A0), a non-breaking space must
     * never be collapsed, so it may not be turned into a normal space here.
     */
    private static final Pattern HORIZONTAL_WHITESPACE = Pattern.compile("[\\h&&[^\\u00A0]]");
    private static final Pattern SUBSEQUENT_SPACE = Pattern.compile(" {2,}");
    private static final Pattern SPACES_ACROSS_TAGS = Pattern.compile(" +((</?[^<>]+>)+) +");
    private static final Pattern SPACES_START = Pattern.compile("^((</?[^<>]+>)+) +");
    private static final Pattern SPACES_END = Pattern.compile(" +((</?[^<>]+>)+)$");

    public static final HtmlWhiteSpaceUtil INSTANCE = new HtmlWhiteSpaceUtil();

    private HtmlWhiteSpaceUtil() {
    }

    /**
     * Remove the whitespace from the given HTML that does not contribute to the rendered result.
     * Preformatted elements (including nested ones) are returned exactly as they were passed in.
     *
     * @param html The HTML to clean up, may not be null
     * @return The HTML with unnecessary whitespace removed
     */
    public String removeUnnecessaryWhiteSpace(String html) {
        // Start by removing leading and trailing whitespaces
        html = html.trim();

        // Find preformatted elements and replace them with a placeholder, we don't want to change the formatting.
        // Insertion order is kept so the elements can be restored in reverse order later on.
        Map<String, String> preformattedElements = new LinkedHashMap<>();
        int num = 0;
        boolean found;
        do {
            found = false;
            // A new matcher is created for every pass: the pattern only matches innermost elements,
            // once those are replaced by a placeholder the element surrounding them can be matched
            Matcher matcher = PREFORMATTED_ELEMENT.matcher(html);
            while (matcher.find()) {
                // Find a key to replace the element with, this can absolutely not already be in the string
                String key = placeholderKey(num);
                while (html.contains(key)) {
                    key = placeholderKey(++num);
                }
                num++;
                // Store the element so it can be re-added later
                String element = matcher.group();
                preformattedElements.put(key, element);
                // Replace the element in the html with the key
                html = html.replace(element, key);
                found = true;
            }
        } while (found);

        // Ignore whitespace before and after line breaks
        html = BEFORE_AFTER_LINEBREAK.matcher(html).replaceAll("");
        // Replace all collapsible horizontal whitespace characters with normal spaces
        html = HORIZONTAL_WHITESPACE.matcher(html).replaceAll(" ");
        // Remove subsequent spaces
        html = SUBSEQUENT_SPACE.matcher(html).replaceAll(" ");
        // Remove spaces across tags
        html = SPACES_ACROSS_TAGS.matcher(html).replaceAll(" $1");
        // Remove spaces at the start and end of the string across the first and last tag
        html = SPACES_START.matcher(html).replaceAll("$1");
        html = SPACES_END.matcher(html).replaceAll("$1");

        // Add preformatted elements back in, in reverse order to handle nested elements
        // (an outer element was stored later and still contains the placeholder of the inner one)
        List<String> keys = new ArrayList<>(preformattedElements.keySet());
        Collections.reverse(keys);
        for (String key : keys) {
            html = html.replace(key, preformattedElements.get(key));
        }

        return html;
    }

    /**
     * Build the placeholder that temporarily takes the place of a preformatted element.
     * The placeholder contains no whitespace, so the clean-up steps leave it untouched.
     */
    private static String placeholderKey(int num) {
        return "%%%preformattedElement-" + num + "%%%%";
    }
}
62 changes: 0 additions & 62 deletions src/main/java/dev/dediamondpro/minemark/utils/PrefixedReader.java

This file was deleted.

0 comments on commit 43f8a25

Please sign in to comment.