Skip to content

Commit

Permalink
Raise error when id tag doesn't match filename book id (#141)
Browse files Browse the repository at this point in the history
* raise error when id tag doesn't match filename book id

* Revert "raise error when id tag doesn't match filename book id"

This reverts commit 8679b78.

* raise error on invalid and mismatched book ids, take 2
  • Loading branch information
mshannon-sil authored Nov 15, 2024
1 parent 183fdfb commit bd8707f
Show file tree
Hide file tree
Showing 10 changed files with 178 additions and 2 deletions.
8 changes: 8 additions & 0 deletions machine/corpora/usfm_text_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from io import TextIOWrapper
from typing import Generator, Iterable, List, Optional, Sequence

from ..scripture.canon import ALL_BOOK_IDS
from ..scripture.verse_ref import Versification
from ..utils.string_utils import has_sentence_ending
from .corpora_utils import gen
Expand Down Expand Up @@ -90,6 +91,13 @@ def __init__(self, text: UsfmTextBase) -> None:
def rows(self) -> Iterable[TextRow]:
return self._rows

def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
    """Run the inherited book-start handling, then validate the book code.

    Args:
        state: Parser state, forwarded unchanged to the parent handler.
        marker: The marker that started the book, forwarded unchanged.
        code: The book code to validate.

    Raises:
        ValueError: If ``code`` is not a member of ``ALL_BOOK_IDS``, or if it
            differs from ``self._text.id``.
    """
    super().start_book(state, marker, code)

    # Reject unknown book codes first, so the mismatch check below only ever
    # reports on codes that are at least valid.
    is_known_book = code in ALL_BOOK_IDS
    if not is_known_book:
        raise ValueError(f"The book {code} is not a valid book id.")

    expected_id = self._text.id
    if code != expected_id:
        raise ValueError(f"The \\id marker {code} does not match the text id {expected_id}.")

def verse(
self,
state: UsfmParserState,
Expand Down
14 changes: 13 additions & 1 deletion tests/corpora/test_scripture_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH
from pytest import raises
from testutils.corpora_test_helpers import USFM_MISMATCH_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH

from machine.corpora import ParatextTextCorpus, extract_scripture_corpus
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef
Expand Down Expand Up @@ -59,3 +60,14 @@ def test_extract_scripture_corpus() -> None:
assert text == ""
assert orig_vref.exact_equals(VerseRef.from_string("MAT 2:12", ORIGINAL_VERSIFICATION))
assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("MAT 2:12", corpus.versification))


def test_extract_scripture_corpus_mismatch_id() -> None:
    """Extracting a corpus must raise a RuntimeError reporting the id mismatch."""
    mismatch_corpus = ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True)

    # Regular expression searched for in the raised error's message.
    expected_error = (
        r"An error occurred while parsing the text 'JDG' in project mismatch_id. "
        r"Verse: JUD 1:0, line: 1, character: 1, error: 'The \\id marker JUD does not match the text id JDG.'"
    )

    with raises(RuntimeError, match=expected_error):
        # Materialize the extraction so the whole corpus is actually consumed.
        list(extract_scripture_corpus(mismatch_corpus))
16 changes: 15 additions & 1 deletion tests/corpora/test_usfm_file_text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, scripture_ref
from pytest import raises
from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH, scripture_ref

from machine.corpora import ScriptureRef, UsfmFileTextCorpus

Expand Down Expand Up @@ -244,6 +245,19 @@ def test_get_rows_include_markers_all_text() -> None:
assert rows[26].text == "Here is some sidebar // content."


def test_get_rows_invalid_id() -> None:
    """Iterating the 'JGS' text must raise a RuntimeError naming the invalid id."""
    corpus = UsfmFileTextCorpus(USFM_INVALID_ID_PROJECT_PATH)

    text = corpus.get_text("JGS")
    assert text is not None
    # `match` is applied as a regular expression, so the pattern is written as
    # raw strings with the literal dots escaped; the closing quote of the
    # reported error is included so the whole message is pinned.
    with raises(
        RuntimeError,
        match=r"An error occurred while parsing the text 'JGS'\."
        r" Verse: 1:0, line: 1, character: 1, error: 'The book JGS is not a valid book id\.'",
    ):
        list(text)


def test_usfm_file_text_corpus_lowercase_usfm_id() -> None:
corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH)

Expand Down
2 changes: 2 additions & 0 deletions tests/testutils/corpora_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes"
USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target"
USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source"
USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id"
USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id"
USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes"
TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt"
CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs"
Expand Down
5 changes: 5 additions & 0 deletions tests/testutils/data/usfm/invalid_id/07JDG.SFM
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
\id JGS - Test
\h Judges
\mt Judges
\c 1
\v 1 Chapter one, verse one.
34 changes: 34 additions & 0 deletions tests/testutils/data/usfm/invalid_id/Settings.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<ScriptureText>
<StyleSheet>usfm.sty</StyleSheet>
<Versification>4</Versification>
<LanguageIsoCode>en:::</LanguageIsoCode>
<Language>English</Language>
<MinParatextVersion>8.0.100.76</MinParatextVersion>
<FullName>Test</FullName>
<Encoding>65001</Encoding>
<Editable>T</Editable>
<Copyright />
<NormalizationForm>NFC</NormalizationForm>
<Name>invalid_id</Name>
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
<DefaultFont>Charis SIL</DefaultFont>
<DefaultFontSize>12</DefaultFontSize>
<FontFeatures />
<HtmlLanguage />
<AssociatedLexicalProject />
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
<FileNamePrePart />
<FileNamePostPart>.SFM</FileNamePostPart>
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
<MatchBasedOnStems>F</MatchBasedOnStems>
<AllowReadAccess>F</AllowReadAccess>
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
<Visibility>Public</Visibility>
<TranslationInfo>Standard::</TranslationInfo>
<EncodingConverter />
<UsfmVersion>3</UsfmVersion>
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
<BibleModuleAssociations />
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
</ScriptureText>
31 changes: 31 additions & 0 deletions tests/testutils/data/usfm/invalid_id/custom.vrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# custom.vrs

LEV 14:56
ROM 14:26
REV 12:17
TOB 5:22
TOB 10:12
SIR 23:28
ESG 1:22
ESG 3:15
ESG 5:14
ESG 8:17
ESG 10:14
SIR 33:33
SIR 41:24
BAR 1:22
4MA 7:25
4MA 12:20

# deliberately missing verses
-ROM 16:26
-ROM 16:27
-3JN 1:15
-S3Y 1:49
-ESG 4:6
-ESG 9:5
-ESG 9:30

LEV 14:55 = LEV 14:55
LEV 14:55 = LEV 14:56
LEV 14:56 = LEV 14:57
5 changes: 5 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/07JDG.SFM
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
\id JUD - Test
\h Judges
\mt Judges
\c 1
\v 1 Chapter one, verse one.
34 changes: 34 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/Settings.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<ScriptureText>
<StyleSheet>usfm.sty</StyleSheet>
<Versification>4</Versification>
<LanguageIsoCode>en:::</LanguageIsoCode>
<Language>English</Language>
<MinParatextVersion>8.0.100.76</MinParatextVersion>
<FullName>Test</FullName>
<Encoding>65001</Encoding>
<Editable>T</Editable>
<Copyright />
<NormalizationForm>NFC</NormalizationForm>
<Name>mismatch_id</Name>
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
<DefaultFont>Charis SIL</DefaultFont>
<DefaultFontSize>12</DefaultFontSize>
<FontFeatures />
<HtmlLanguage />
<AssociatedLexicalProject />
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
<FileNamePrePart />
<FileNamePostPart>.SFM</FileNamePostPart>
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
<MatchBasedOnStems>F</MatchBasedOnStems>
<AllowReadAccess>F</AllowReadAccess>
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
<Visibility>Public</Visibility>
<TranslationInfo>Standard::</TranslationInfo>
<EncodingConverter />
<UsfmVersion>3</UsfmVersion>
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
<BibleModuleAssociations />
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
</ScriptureText>
31 changes: 31 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/custom.vrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# custom.vrs

LEV 14:56
ROM 14:26
REV 12:17
TOB 5:22
TOB 10:12
SIR 23:28
ESG 1:22
ESG 3:15
ESG 5:14
ESG 8:17
ESG 10:14
SIR 33:33
SIR 41:24
BAR 1:22
4MA 7:25
4MA 12:20

# deliberately missing verses
-ROM 16:26
-ROM 16:27
-3JN 1:15
-S3Y 1:49
-ESG 4:6
-ESG 9:5
-ESG 9:30

LEV 14:55 = LEV 14:55
LEV 14:55 = LEV 14:56
LEV 14:56 = LEV 14:57

0 comments on commit bd8707f

Please sign in to comment.