Skip to content

Commit

Permalink
Fix XML attribute validation
Browse files Browse the repository at this point in the history
When converting from the `[a-zA-Z_:][-a-zA-Z0-9_:.]*` regex to a hand-written character check, the `-`, `.`, and digit characters were missed.

Also clarified HTML attribute scan.

Fixes #2235
  • Loading branch information
jhy committed Nov 28, 2024
1 parent f6e82f2 commit 0a4b830
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 6 deletions.
7 changes: 7 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# jsoup Changelog

## 1.18.3 (PENDING)

### Bug Fixes

* When serializing to XML, attribute names containing `-`, `.`, or digits were incorrectly marked as invalid and
removed. [2235](https://github.com/jhy/jsoup/issues/2235)

## 1.18.2 (2024-Nov-27)

### Improvements
Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.19.1-SNAPSHOT</version><!-- remember to update previous version below for japicmp -->
<version>1.18.3-SNAPSHOT</version><!-- remember to update previous version below for japicmp -->
<url>https://jsoup.org/</url>
<description>jsoup is a Java library that simplifies working with real-world HTML and XML. It offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM API methods, CSS, and xpath selectors. jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers.</description>
<inceptionYear>2009</inceptionYear>
Expand Down Expand Up @@ -237,7 +237,7 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.1</version>
<version>1.18.2</version>
<type>jar</type>
</dependency>
</oldVersion>
Expand Down
8 changes: 4 additions & 4 deletions src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,13 @@ else if (syntax == Syntax.html && !isValidHtmlKey(key)) {
private static boolean isValidXmlKey(String key) {
// =~ [a-zA-Z_:][-a-zA-Z0-9_:.]*
final int length = key.length();
if (length ==0) return false;
if (length == 0) return false;
char c = key.charAt(0);
if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':'))
return false;
for (int i = 1; i < length; i++) {
c = key.charAt(i);
if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':'))
if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '-' || c == '_' || c == ':' || c == '.'))
return false;
}
return true;
Expand All @@ -214,10 +214,10 @@ private static boolean isValidXmlKey(String key) {
private static boolean isValidHtmlKey(String key) {
// =~ [\x00-\x1f\x7f-\x9f "'/=]+
final int length = key.length();
if (length ==0) return false;
if (length == 0) return false;
for (int i = 0; i < length; i++) {
char c = key.charAt(i);
if (c <= 0x1f || c >= 0x7f && c <= 0x9f || c == ' ' || c == '"' || c == '\'' || c == '/' || c == '=')
if ((c <= 0x1f) || (c >= 0x7f && c <= 0x9f) || c == ' ' || c == '"' || c == '\'' || c == '/' || c == '=')
return false;
}
return true;
Expand Down
9 changes: 9 additions & 0 deletions src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,15 @@ public void handlesLTinScript() {
assertEquals("<body style=\"color: red\" _=\"\" name_=\"\"><div _=\"\"></div></body>", out);
}

// Checks how attribute names are serialized when the document uses XML output syntax:
// a name made only of letters, digits, '-', '_', ':' and '.' is expected to be emitted unchanged,
// while a name containing '!' is expected to be emitted with that character replaced by '_'.
@Test void xmlValidAttributes() {
// Two attributes: "bB1-_:." (expected to survive as-is) and "_9!" (expected to be coerced).
String xml = "<a bB1-_:.=foo _9!=bar>One</a>";
Document doc = Jsoup.parse(xml, Parser.xmlParser());
// Parsing with the XML parser is expected to leave the output settings in XML syntax.
assertEquals(Syntax.xml, doc.outputSettings().syntax());

String out = doc.html();
assertEquals("<a bB1-_:.=\"foo\" _9_=\"bar\">One</a>", out); // first is same, second coerced
}

@Test void customTagsAreFlyweights() {
String xml = "<foo>Foo</foo><foo>Foo</foo><FOO>FOO</FOO><FOO>FOO</FOO>";
Document doc = Jsoup.parse(xml, Parser.xmlParser());
Expand Down

0 comments on commit 0a4b830

Please sign in to comment.