From 0a4b830ccb9df906c11d154c774e14fb2eb5deff Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 29 Nov 2024 10:47:55 +1100 Subject: [PATCH] Fix XML attribute validation When converting from the `[a-zA-Z_:][-a-zA-Z0-9_:.]*` regex, missed `-`, `.`, and digits. Also clarified HTML attribute scan. Fixes #2235 --- CHANGES.md | 7 +++++++ pom.xml | 4 ++-- src/main/java/org/jsoup/nodes/Attribute.java | 8 ++++---- src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java | 9 +++++++++ 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 3233cbf89b..b418964c75 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,12 @@ # jsoup Changelog +## 1.18.3 (PENDING) + +### Bug Fixes + +* When serializing to XML, attribute names containing `-`, `.`, or digits were incorrectly marked as invalid and + removed. [2235](https://github.com/jhy/jsoup/issues/2235) + ## 1.18.2 (2024-Nov-27) ### Improvements diff --git a/pom.xml b/pom.xml index 66bae439a8..7e335bc00a 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ org.jsoup jsoup - 1.19.1-SNAPSHOT + 1.18.3-SNAPSHOT https://jsoup.org/ jsoup is a Java library that simplifies working with real-world HTML and XML. It offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM API methods, CSS, and xpath selectors. jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers. 2009 @@ -237,7 +237,7 @@ org.jsoup jsoup - 1.18.1 + 1.18.2 jar diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index cf9467f798..9f82881e7e 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -199,13 +199,13 @@ else if (syntax == Syntax.html && !isValidHtmlKey(key)) { private static boolean isValidXmlKey(String key) { // =~ [a-zA-Z_:][-a-zA-Z0-9_:.]* final int length = key.length(); - if (length ==0) return false; + if (length == 0) return false; char c = key.charAt(0); if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':')) return false; for (int i = 1; i < length; i++) { c = key.charAt(i); - if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':')) + if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || c == ':' || c == '.')) return false; } return true; @@ -214,10 +214,10 @@ private static boolean isValidXmlKey(String key) { private static boolean isValidHtmlKey(String key) { // =~ [\x00-\x1f\x7f-\x9f "'/=]+ final int length = key.length(); - if (length ==0) return false; + if (length == 0) return false; for (int i = 0; i < length; i++) { char c = key.charAt(i); - if (c <= 0x1f || c >= 0x7f && c <= 0x9f || c == ' ' || c == '"' || c == '\'' || c == '/' || c == '=') + if ((c <= 0x1f) || (c >= 0x7f && c <= 0x9f) || c == ' ' || c == '"' || c == '\'' || c == '/' || c == '=') return false; } return true; diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java index c57947bfde..88e89bb3d9 100644 --- a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java +++ b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java @@ -299,6 +299,15 @@ public void handlesLTinScript() { assertEquals("
", out); } + @Test void xmlValidAttributes() { + String xml = "One"; + Document doc = Jsoup.parse(xml, Parser.xmlParser()); + assertEquals(Syntax.xml, doc.outputSettings().syntax()); + + String out = doc.html(); + assertEquals("One", out); // first is same, second coerced + } + @Test void customTagsAreFlyweights() { String xml = "FooFooFOOFOO"; Document doc = Jsoup.parse(xml, Parser.xmlParser());