From 4be65312f7bbd4b09cff66433ebd95b30067b690 Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Mon, 27 May 2024 17:53:01 +0100 Subject: [PATCH] Fix duplicate values in org.apache.lucene.analysis.ko.dict.UserDictionary (#13427) Remove the incorrect assertion in org.apache.lucene.analysis.ko.dict.UserDictionary, and replace it with an array copy when duplicate values are passed. --- .../lucene/analysis/ko/dict/UserDictionary.java | 5 ++++- .../lucene/analysis/ko/TestKoreanTokenizer.java | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java index 5614451df64b..4cebdf2024ca 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java @@ -23,6 +23,7 @@ import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.ko.POS; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; @@ -138,10 +139,12 @@ private UserDictionary(List entries) throws IOException { lastToken = token; ord++; } + if (entryIndex < rightIds.length) { + rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex); + } this.fst = new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader())); this.segmentations = segmentations.toArray(new int[segmentations.size()][]); - assert entryIndex == rightIds.length; this.rightIds = rightIds; } diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java index 315c876435c4..a2349a651d04 100644 --- 
a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java @@ -20,6 +20,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.Random; import org.apache.lucene.analysis.Analyzer; @@ -593,6 +594,22 @@ public void testCombining() throws IOException { new POS.Tag[] {POS.Tag.SL}); } + public void testDuplicate() throws IOException { + String s = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시"; + try (Reader rulesReader = new StringReader(s)) { + var dict = UserDictionary.open(rulesReader); + assertTrue(dict.getRightId(3) != 0); + assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4)); + } + + String dupdup = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시\n세종시 세종 시"; + try (Reader rulesReader = new StringReader(dupdup)) { + var dict = UserDictionary.open(rulesReader); + assertTrue(dict.getRightId(3) != 0); + assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4)); + } + } + private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException { try (TokenStream ts = analyzer.tokenStream("ignored", input)) {