From bd8707fa6a38069fb1a127b91efe7d5cc3d8bca5 Mon Sep 17 00:00:00 2001
From: mshannon-sil <131058912+mshannon-sil@users.noreply.github.com>
Date: Fri, 15 Nov 2024 09:30:46 -0500
Subject: [PATCH] Raise error when id tag doesn't match filename book id (#141)
* raise error when id tag doesn't match filename book id
* Revert "raise error when id tag doesn't match filename book id"
This reverts commit 8679b785cdecb2a427726baf55b023a80496498e.
* raise error on invalid and mismatched book ids, take 2
---
machine/corpora/usfm_text_base.py | 8 +++++
tests/corpora/test_scripture_text_corpus.py | 14 +++++++-
tests/corpora/test_usfm_file_text.py | 16 ++++++++-
tests/testutils/corpora_test_helpers.py | 2 ++
.../testutils/data/usfm/invalid_id/07JDG.SFM | 5 +++
.../data/usfm/invalid_id/Settings.xml | 34 +++++++++++++++++++
.../testutils/data/usfm/invalid_id/custom.vrs | 31 +++++++++++++++++
.../testutils/data/usfm/mismatch_id/07JDG.SFM | 5 +++
.../data/usfm/mismatch_id/Settings.xml | 34 +++++++++++++++++++
.../data/usfm/mismatch_id/custom.vrs | 31 +++++++++++++++++
10 files changed, 178 insertions(+), 2 deletions(-)
create mode 100644 tests/testutils/data/usfm/invalid_id/07JDG.SFM
create mode 100644 tests/testutils/data/usfm/invalid_id/Settings.xml
create mode 100644 tests/testutils/data/usfm/invalid_id/custom.vrs
create mode 100644 tests/testutils/data/usfm/mismatch_id/07JDG.SFM
create mode 100644 tests/testutils/data/usfm/mismatch_id/Settings.xml
create mode 100644 tests/testutils/data/usfm/mismatch_id/custom.vrs
diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py
index 1fd96f3..c150779 100644
--- a/machine/corpora/usfm_text_base.py
+++ b/machine/corpora/usfm_text_base.py
@@ -2,6 +2,7 @@
from io import TextIOWrapper
from typing import Generator, Iterable, List, Optional, Sequence
+from ..scripture.canon import ALL_BOOK_IDS
from ..scripture.verse_ref import Versification
from ..utils.string_utils import has_sentence_ending
from .corpora_utils import gen
@@ -90,6 +91,13 @@ def __init__(self, text: UsfmTextBase) -> None:
def rows(self) -> Iterable[TextRow]:
return self._rows
+ def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:  # validate the \id book code
+ super().start_book(state, marker, code)  # run the parent handler's book-start logic before validating
+ if code not in ALL_BOOK_IDS:  # code must be one of the ids listed in scripture.canon.ALL_BOOK_IDS
+ raise ValueError(f"The book {code} is not a valid book id.")
+ if code != self._text.id:  # code must also equal the id of the text held in self._text
+ raise ValueError(f"The \\id marker {code} does not match the text id {self._text.id}.")
+
def verse(
self,
state: UsfmParserState,
diff --git a/tests/corpora/test_scripture_text_corpus.py b/tests/corpora/test_scripture_text_corpus.py
index 925c9ca..5bbbfca 100644
--- a/tests/corpora/test_scripture_text_corpus.py
+++ b/tests/corpora/test_scripture_text_corpus.py
@@ -1,4 +1,5 @@
-from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH
+from pytest import raises
+from testutils.corpora_test_helpers import USFM_MISMATCH_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH
from machine.corpora import ParatextTextCorpus, extract_scripture_corpus
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef
@@ -59,3 +60,14 @@ def test_extract_scripture_corpus() -> None:
assert text == ""
assert orig_vref.exact_equals(VerseRef.from_string("MAT 2:12", ORIGINAL_VERSIFICATION))
assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("MAT 2:12", corpus.versification))
+
+
+def test_extract_scripture_corpus_mismatch_id() -> None:  # a \id code that differs from the text id must fail
+ corpus = ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True)  # 07JDG.SFM declares \id JUD
+
+ with raises(
+ RuntimeError,  # the mismatch message is expected embedded in a RuntimeError's text
+ match=r"An error occurred while parsing the text 'JDG' in project mismatch_id. "
+ r"Verse: JUD 1:0, line: 1, character: 1, error: 'The \\id marker JUD does not match the text id JDG.'",
+ ):
+ list(extract_scripture_corpus(corpus))  # list() drives the iteration inside the raises() context
diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py
index 383b95a..3f87fd3 100644
--- a/tests/corpora/test_usfm_file_text.py
+++ b/tests/corpora/test_usfm_file_text.py
@@ -1,4 +1,5 @@
-from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, scripture_ref
+from pytest import raises
+from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH, scripture_ref
from machine.corpora import ScriptureRef, UsfmFileTextCorpus
@@ -244,6 +245,19 @@ def test_get_rows_include_markers_all_text() -> None:
assert rows[26].text == "Here is some sidebar // content."
+def test_get_rows_invalid_id() -> None:
+ corpus = UsfmFileTextCorpus(USFM_INVALID_ID_PROJECT_PATH)
+
+ text = corpus.get_text("JGS")
+ assert text is not None
+ with raises(
+ RuntimeError,
+ match="An error occurred while parsing the text 'JGS'."
+ " Verse: 1:0, line: 1, character: 1, error: 'The book JGS is not a valid book id.",
+ ):
+ list(text)
+
+
def test_usfm_file_text_corpus_lowercase_usfm_id() -> None:
corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH)
diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py
index 4fd9341..e287560 100644
--- a/tests/testutils/corpora_test_helpers.py
+++ b/tests/testutils/corpora_test_helpers.py
@@ -9,6 +9,8 @@
USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes"
USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target"
USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source"
+USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id"  # 07JDG.SFM carries the invalid code JGS
+USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id"  # 07JDG.SFM carries \id JUD, not JDG
USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes"
TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt"
CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs"
diff --git a/tests/testutils/data/usfm/invalid_id/07JDG.SFM b/tests/testutils/data/usfm/invalid_id/07JDG.SFM
new file mode 100644
index 0000000..6d75497
--- /dev/null
+++ b/tests/testutils/data/usfm/invalid_id/07JDG.SFM
@@ -0,0 +1,5 @@
+\id JGS - Test
+\h Judges
+\mt Judges
+\c 1
+\v 1 Chapter one, verse one.
\ No newline at end of file
diff --git a/tests/testutils/data/usfm/invalid_id/Settings.xml b/tests/testutils/data/usfm/invalid_id/Settings.xml
new file mode 100644
index 0000000..aa24e29
--- /dev/null
+++ b/tests/testutils/data/usfm/invalid_id/Settings.xml
@@ -0,0 +1,34 @@
+
+ usfm.sty
+ 4
+ en:::
+ English
+ 8.0.100.76
+ Test
+ 65001
+ T
+
+ NFC
+ invalid_id
+ a7e0b3ce0200736062f9f810a444dbfbe64aca35
+ Charis SIL
+ 12
+
+
+
+ 41MAT
+
+ .SFM
+ Major::BiblicalTerms.xml
+ F
+ F
+ F
+ Public
+ Standard::
+
+ 3
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000
+
+
+
\ No newline at end of file
diff --git a/tests/testutils/data/usfm/invalid_id/custom.vrs b/tests/testutils/data/usfm/invalid_id/custom.vrs
new file mode 100644
index 0000000..fb315af
--- /dev/null
+++ b/tests/testutils/data/usfm/invalid_id/custom.vrs
@@ -0,0 +1,31 @@
+# custom.vrs
+
+LEV 14:56
+ROM 14:26
+REV 12:17
+TOB 5:22
+TOB 10:12
+SIR 23:28
+ESG 1:22
+ESG 3:15
+ESG 5:14
+ESG 8:17
+ESG 10:14
+SIR 33:33
+SIR 41:24
+BAR 1:22
+4MA 7:25
+4MA 12:20
+
+# deliberately missing verses
+-ROM 16:26
+-ROM 16:27
+-3JN 1:15
+-S3Y 1:49
+-ESG 4:6
+-ESG 9:5
+-ESG 9:30
+
+LEV 14:55 = LEV 14:55
+LEV 14:55 = LEV 14:56
+LEV 14:56 = LEV 14:57
\ No newline at end of file
diff --git a/tests/testutils/data/usfm/mismatch_id/07JDG.SFM b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM
new file mode 100644
index 0000000..1959177
--- /dev/null
+++ b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM
@@ -0,0 +1,5 @@
+\id JUD - Test
+\h Judges
+\mt Judges
+\c 1
+\v 1 Chapter one, verse one.
\ No newline at end of file
diff --git a/tests/testutils/data/usfm/mismatch_id/Settings.xml b/tests/testutils/data/usfm/mismatch_id/Settings.xml
new file mode 100644
index 0000000..5e09b68
--- /dev/null
+++ b/tests/testutils/data/usfm/mismatch_id/Settings.xml
@@ -0,0 +1,34 @@
+
+ usfm.sty
+ 4
+ en:::
+ English
+ 8.0.100.76
+ Test
+ 65001
+ T
+
+ NFC
+ mismatch_id
+ a7e0b3ce0200736062f9f810a444dbfbe64aca35
+ Charis SIL
+ 12
+
+
+
+ 41MAT
+
+ .SFM
+ Major::BiblicalTerms.xml
+ F
+ F
+ F
+ Public
+ Standard::
+
+ 3
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000
+
+
+
\ No newline at end of file
diff --git a/tests/testutils/data/usfm/mismatch_id/custom.vrs b/tests/testutils/data/usfm/mismatch_id/custom.vrs
new file mode 100644
index 0000000..fb315af
--- /dev/null
+++ b/tests/testutils/data/usfm/mismatch_id/custom.vrs
@@ -0,0 +1,31 @@
+# custom.vrs
+
+LEV 14:56
+ROM 14:26
+REV 12:17
+TOB 5:22
+TOB 10:12
+SIR 23:28
+ESG 1:22
+ESG 3:15
+ESG 5:14
+ESG 8:17
+ESG 10:14
+SIR 33:33
+SIR 41:24
+BAR 1:22
+4MA 7:25
+4MA 12:20
+
+# deliberately missing verses
+-ROM 16:26
+-ROM 16:27
+-3JN 1:15
+-S3Y 1:49
+-ESG 4:6
+-ESG 9:5
+-ESG 9:30
+
+LEV 14:55 = LEV 14:55
+LEV 14:55 = LEV 14:56
+LEV 14:56 = LEV 14:57
\ No newline at end of file