Skip to content

Commit

Permalink
X3ML for MMM: Changes to scripts which create simplified XML for X3ML
Browse files Browse the repository at this point in the history
Fixes various bugs that show up in the logs when X3ML is run. Convert author birth/death dates to format required for CIDOC-CRM.
  • Loading branch information
andrew-morrison committed Jan 30, 2019
1 parent d0e62a1 commit f3b05c9
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 298 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
processing/html/
processing/solr/
processing/analysis/results/
processing/analysis/mmm/results/
processing/lib/
*.log
*.tmp
246 changes: 246 additions & 0 deletions processing/analysis/mmm/simplify-authorities-for-x3ml.xquery
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare option saxon:output "indent=yes";

declare variable $authorityfile as xs:string external;
declare variable $chunk as xs:integer external;
declare variable $numchunks as xs:integer external;

declare function local:logging($level as xs:string, $msg as xs:string, $values)
{
    (: Write a log line to STDERR through trace() without adding anything to the XML output.
       The line is the upper-cased level, the message, then the values joined by single spaces.
       trace() is given a zero-length string as its value and the log line as its label; taking
       an empty substring of the result keeps the trace() call in the expression while still
       yielding a zero-length string. :)
    let $logline := concat(upper-case($level), ' ', $msg, ' ', string-join($values, ' '), ' ')
    let $traced := trace('', $logline)
    return
        substring($traced, 0, 0)
};

declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean)
{
    (: Convert a date string of the form [-]YYY[Y][-MM[-DD]] into an xs:dateTime marking the
       first second ($begin is true) or the last second ($begin is false) of the year, month
       or day that the string names. A leading '-' marks a BCE year, three-digit years are
       zero-padded to four digits, and anything after the recognised prefix is ignored.

       Returns the empty sequence, after logging an error, when no year can be read or when
       the extracted parts do not form a valid date (for example month 13, 30 February, or a
       sign with no usable year). Checking with "castable" first means one malformed record
       is reported in the log instead of aborting the whole run with a cast error. :)
    let $isbce := starts-with($datestring, '-')
    let $unsigned := if ($isbce) then substring($datestring, 2) else $datestring
    let $digits :=
        if (matches($unsigned, '^\d\d\d\d')) then substring($unsigned, 1, 4)
        else if (matches($unsigned, '^\d\d\d')) then concat('0', substring($unsigned, 1, 3))
        else ''
    (: Only re-attach the sign when a year was found; a bare '-' must not be mistaken for a year :)
    let $year := if ($isbce and $digits ne '') then concat('-', $digits) else $digits
    let $month := if (matches($unsigned, '^\d?\d\d\d-\d\d')) then substring(substring-after($unsigned, '-'), 1, 2) else ''
    let $day := if (matches($unsigned, '^\d?\d\d\d-\d\d-\d\d')) then substring(substring-after(substring-after($unsigned, '-'), '-'), 1, 2) else ''
    (: Lexical form of the first second of the named period, with missing parts defaulting to 01 :)
    let $first :=
        if ($year eq '') then
            ''
        else
            concat(
                $year, '-',
                if ($month ne '') then $month else '01', '-',
                if ($day ne '') then $day else '01',
                'T00:00:00'
            )
    return
        if (not($first castable as xs:dateTime)) then
            (: The original string, sign included, goes into the log; [2] discards the logging result :)
            (local:logging('error', 'Unreadable date format', $datestring), ())[2]
        else if ($begin) then
            xs:dateTime($first)
        else if ($day ne '') then
            xs:dateTime(concat($year, '-', $month, '-', $day, 'T23:59:59'))
        else if ($month = ('', '12')) then
            (: A whole year, or December: the period ends on 31 December :)
            xs:dateTime(concat($year, '-12-31', 'T23:59:59'))
        else
            (: Months differ in length, so take the first second of the next month and step back one second :)
            xs:dateTime($first) + xs:yearMonthDuration('P1M') - xs:dayTimeDuration('PT1S')
};

declare function local:extractDates($elem as element()) as element()*
{
    (: Reduce the dating attributes of an element to the from/to child elements needed for
       mapping to CIDOC-CRM. @when counts towards both ends of the range, @notBefore and @from
       towards the start, @notAfter and @to towards the end. The earliest start becomes <from>
       and the latest end becomes <to>; either is left out when no usable date was found. :)
    let $earliest := min(
        for $attr in $elem/(@when|@notBefore|@from)
        return local:dateConversion(normalize-space(data($attr)), true())
    )
    let $latest := max(
        for $attr in $elem/(@when|@notAfter|@to)
        return local:dateConversion(normalize-space(data($attr)), false())
    )
    return
    (
        if (exists($earliest)) then <from>{ $earliest }</from> else (),
        if (exists($latest)) then <to>{ $latest }</to> else ()
    )
};

declare function local:links($elem as element()) as element()*
{
    (: Copy the http(s) references found anywhere inside the element's "links" notes into a
       single namespace-free note/list/item/ref structure. Each ref keeps its target and its
       whitespace-normalized text. Returns nothing when there are no such references. :)
    let $refs := $elem/tei:note[@type='links']//tei:ref[starts-with(@target, 'http')]
    return
        if (empty($refs)) then
            ()
        else
            <note type="links">
                <list>
                {
                    for $ref in $refs
                    return
                        <item>
                            <ref target="{ $ref/@target }">{ normalize-space(string($ref)) }</ref>
                        </item>
                }
                </list>
            </note>
};

declare function local:otherNotes($elem as element()) as element()*
{
    (: Emit a namespace-free <note> for every note on the element other than the "links" note,
       followed by the notes in the head of any ancestor whose name starts with "list".
       Attributes are copied across and the text content is whitespace-normalized. :)
    for $source in (
        $elem/tei:note[not(@type='links')],
        $elem/ancestor::tei:*[starts-with(local-name(), 'list')]/tei:head/tei:note
    )
    return
        <note>{ $source/@*, normalize-space(string($source)) }</note>
};

(: The share of the authority file handled by this run: top-level bibl, person, place and org
   entries that carry an xml:id and are not nested inside another such entry, numbered from 1
   in document order and kept when their number modulo $numchunks equals $chunk. :)
declare variable $authorityentries as element()* := (
    doc($authorityfile)//(tei:bibl|tei:person|tei:place|tei:org)
        [@xml:id]
        [not(ancestor::tei:bibl or ancestor::tei:person or ancestor::tei:place or ancestor::tei:org)]
)[position() mod $numchunks = $chunk];

<TEI>
    <text>
        <body>
        {
            (: Query body. Writes one namespace-free list per kind of authority entry present in
               this chunk (works, people, places, organisations), in that order. Each entry keeps
               its xml:id, one preferred name, its distinct non-empty variant names, selected
               descriptive children, its http links and its remaining notes. :)
            let $works := $authorityentries[self::tei:bibl]
            let $people := $authorityentries[self::tei:person]
            let $places := $authorityentries[self::tei:place]
            let $orgs := $authorityentries[self::tei:org]
            return
            (
                if (exists($works)) then
                    <listBibl>
                    {
                        for $work in $works
                        return
                            <bibl xml:id="{ $work/@xml:id }">
                                <title type="uniform">{ normalize-space(string($work/tei:title[@type='uniform'][1])) }</title>
                                {
                                    for $variant in distinct-values(
                                        for $t in $work/tei:title[not(@type='uniform')]
                                        return normalize-space(string($t))
                                    )[. ne '']
                                    return
                                        <title type="variant">{ $variant }</title>
                                }
                                {
                                    for $child in $work/(tei:author|tei:textLang)
                                    return
                                        element { local-name($child) } { $child/@*, normalize-space(string($child)) }
                                }
                                { local:links($work) }
                                { local:otherNotes($work) }
                            </bibl>
                    }
                    </listBibl>
                else (),
                if (exists($people)) then
                    <listPerson>
                    {
                        for $person in $people
                        return
                            <person xml:id="{ $person/@xml:id }">
                                <persName type="display">{ normalize-space(string($person/tei:persName[@type='display'][1])) }</persName>
                                {
                                    for $variant in distinct-values(
                                        for $p in $person/tei:persName[not(@type='display')]
                                        return normalize-space(string($p))
                                    )[. ne '']
                                    return
                                        <persName type="variant">{ $variant }</persName>
                                }
                                {
                                    (: birth and death keep only the from/to range derived from their dating attributes :)
                                    for $event in $person/(tei:birth|tei:death)
                                    return
                                        element { local-name($event) } { local:extractDates($event) }
                                }
                                { local:links($person) }
                                { local:otherNotes($person) }
                            </person>
                    }
                    </listPerson>
                else (),
                if (exists($places)) then
                    <listPlace>
                    {
                        for $place in $places
                        (: First geo across all of the entry's locations. Indexing the whole sequence,
                           rather than geo[1] within each location, keeps this a single value even
                           when an entry has several locations with coordinates. :)
                        let $geo := ($place/tei:location/tei:geo)[1]
                        return
                            <place xml:id="{ $place/@xml:id }">
                                <placeName type="index">{ normalize-space(string($place/tei:placeName[@type='index'][1])) }</placeName>
                                {
                                    for $variant in distinct-values(
                                        for $p in $place/tei:placeName[not(@type='index')]
                                        return normalize-space(string($p))
                                    )[. ne '']
                                    return
                                        <placeName type="variant">{ $variant }</placeName>
                                }
                                {
                                    (: NOTE(review): location is flattened to text here and written again with
                                       its geo child just below, so places get two kinds of <location>.
                                       Left as-is; confirm against the X3ML mapping whether both are wanted. :)
                                    for $child in $place/(tei:country|tei:location)
                                    return
                                        element { local-name($child) } { $child/@*, normalize-space(string($child)) }
                                }
                                {
                                    if (exists($geo)) then
                                        <location>
                                            <geo>{ normalize-space(string($geo)) }</geo>
                                        </location>
                                    else ()
                                }
                                { local:links($place) }
                                { local:otherNotes($place) }
                            </place>
                    }
                    </listPlace>
                else (),
                if (exists($orgs)) then
                    <listOrg>
                    {
                        for $org in $orgs
                        (: As for places: one geo value at most, however many locations the entry has :)
                        let $geo := ($org/tei:location/tei:geo)[1]
                        return
                            <org xml:id="{ $org/@xml:id }">
                                <orgName type="display">{ normalize-space(string($org/tei:orgName[@type='display'][1])) }</orgName>
                                {
                                    for $variant in distinct-values(
                                        for $o in $org/tei:orgName[not(@type='display')]
                                        return normalize-space(string($o))
                                    )[. ne '']
                                    return
                                        <orgName type="variant">{ $variant }</orgName>
                                }
                                {
                                    for $child in $org/tei:country
                                    return
                                        element { local-name($child) } { $child/@*, normalize-space(string($child)) }
                                }
                                {
                                    if (exists($geo)) then
                                        <location>
                                            <geo>{ normalize-space(string($geo)) }</geo>
                                        </location>
                                    else ()
                                }
                                { local:links($org) }
                                { local:otherNotes($org) }
                            </org>
                    }
                    </listOrg>
                else ()
            )
        }
        </body>
    </text>
</TEI>
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare option saxon:output "indent=yes";

declare variable $collectionsfolder as xs:string external;
declare variable $chunk as xs:integer external;
declare variable $numchunks as xs:integer external;

declare variable $website := 'https://medieval.bodleian.ox.ac.uk';

Expand Down Expand Up @@ -83,11 +85,12 @@ declare function local:languageCodeLookup($lang as xs:string) as xs:string*
case 'gd' return ('Gaelic', 'http://vocab.getty.edu/aat/300388323')
case 'fy' return ('Frisian', 'http://vocab.getty.edu/aat/300388308')
case 'dlm' return ('Dalmatian', 'http://vocab.getty.edu/aat/300388199')
case 'cai' return ('Central American Indian', 'http://vocab.getty.edu/aat/300388079')
case 'zxx' return ()
default return (local:logging('error', 'Unrecognized language code', $lang), ())[2]
};

declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean) as xs:dateTime*
declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean)
{
let $isbce := starts-with($datestring, '-')
let $datestring := if ($isbce) then substring($datestring, 2) else $datestring
Expand Down Expand Up @@ -126,7 +129,7 @@ declare function local:listItems($manuscript as element(tei:TEI), $mscontent as
{
for $msItem in $mscontent//tei:msItem[tei:title/@key]
(: This flattens all works (msItem) into one list. In the TEI, there can be a hierarchy of works-within-works. :)
let $itemid := $msItem/@xml:id/data()
let $itemid := ($msItem/@xml:id/data(), generate-id($msItem))[1]
let $workid := ($msItem/tei:title/@key)[1]/data()
return
<item>
Expand Down Expand Up @@ -240,7 +243,7 @@ declare function local:extractPhysicalFields($physdesc as element(tei:physDesc)?

declare function local:extractDates($history as element(tei:history)?) as element()*
{
(: Simply dates to a single range for each type. This means losing some detail
(: Simplify dates to a single range for each type. This means losing some detail
(e.g. text written in one century, illustrations in the next.) but that would
be difficult to model in 3M anyway. :)
(
Expand Down Expand Up @@ -401,31 +404,31 @@ declare function local:listDigitizedCopies($surrogates as element(tei:surrogates
<digitalimages>{ $ref/@target/data() }</digitalimages>
};

processing-instruction xml-model {'href="https://raw.githubusercontent.com/bodleian/medieval-mss/master/processing/analysis/simplified4oxlod.xsd" type="application/xml" schematypens="http://www.w3.org/2001/XMLSchema"'},
<manuscripts>
{
for $manuscript at $pos in collection('../../collections/?select=*.xml;recurse=yes')/tei:TEI
for $manuscript at $pos in collection(concat($collectionsfolder, '/?select=*.xml;recurse=yes'))/tei:TEI

return if ($pos mod 20 = $chunk) then
(:
To process everything, change above line to: return if (true()) then
To get a small random-ish sample, use: return if ($pos mod 200 = 0) then
:)
return if ($pos mod $numchunks = $chunk) then

<manuscript>

<uri>{ $website }/catalog/{ $manuscript/@xml:id/data() }</uri>
<classmark>{ $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno[1]/text() }</classmark>
<collection>{ $manuscript/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type="collection"]/text() }</collection>
<repository>{ $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:repository/text() }</repository>
<institution>{ $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution/text() }</institution>

{
if ($manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution) then
<institution>{ $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution/text() }</institution>
else
()
}
{
(: All manuscripts have at least one msContents element, but in multi-part manuscripts each is a child of an msPart :)
for $mscontent in $manuscript//tei:msContents
return
if ($mscontent/parent::tei:msPart) then
<part>
<uri>{ $website }/catalog/{ $manuscript/@xml:id/data() }#{ $mscontent/parent::tei:msPart/@xml:id/data() }</uri>
<uri>{ $website }/catalog/{ $manuscript/@xml:id/data() }#{ ($mscontent/parent::tei:msPart/@xml:id/data(), generate-id($mscontent/parent::tei:msPart))[1] }</uri>
<label>{ $mscontent/parent::tei:msPart/tei:msIdentifier[1]/tei:altIdentifier[1]/tei:idno[1]/text() }</label>
{ local:extractPhysicalFields($mscontent/parent::tei:msPart/tei:physDesc) }
{ local:extractDates($mscontent/parent::tei:msPart/tei:history) }
Expand Down
29 changes: 29 additions & 0 deletions processing/analysis/mmm/simplify-tei-for-x3ml.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
#
# Generate the simplified XML files that are fed to X3ML: chunked manuscript records, chunked
# person and work authority files, and a single places file. All output goes into results/
# next to this script, and everything Saxon writes to STDERR is appended to the log file there.

# Work from the directory containing this script, so that the relative paths below resolve
# the same way wherever the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

# Create subfolder to keep generated files out of GitHub
mkdir -p results

LOGFILE="results/simplify-tei-for-x3ml.log"
SAXON_JAR="../../saxon/saxon9he.jar"

# Number of output chunks. Each value drives both its loop bound and the numchunks parameter
# passed to the query, so the two cannot drift apart and silently drop or duplicate records.
MANUSCRIPT_CHUNKS=20
AUTHORITY_CHUNKS=5

date > "$LOGFILE"

# Run one Saxon query.
#   $1    query file
#   $2    output file
#   $3... external variables for the query, as name=value
run_query () {
    local query="$1"
    local output="$2"
    shift 2
    java -Xmx1G -cp "$SAXON_JAR" net.sf.saxon.Query -xi:on "-q:$query" "-o:$output" "$@" 2>> "$LOGFILE"
}

# Generate the simplified XML, extracting the desired information from the ~10K TEI records, one per manuscript.
# The output is split into chunks to avoid memory issues when they are subsequently processed by x3ml.
for ((i = 0; i < MANUSCRIPT_CHUNKS; i++))
do
    run_query simplify-records-for-x3ml.xquery "results/manuscripts_chunk$i.xml" collectionsfolder="../../../collections" chunk="$i" numchunks="$MANUSCRIPT_CHUNKS"
done

# Strip out namespaces from authority files, which x3ml cannot handle no matter how I try to declare it
# in the mapping file. Also chunk persons and works, again so that x3ml can process them in minutes rather than hours.
for ((i = 0; i < AUTHORITY_CHUNKS; i++))
do
    run_query simplify-authorities-for-x3ml.xquery "results/persons_chunk$i.xml" authorityfile="../../../persons.xml" chunk="$i" numchunks="$AUTHORITY_CHUNKS"
done
for ((i = 0; i < AUTHORITY_CHUNKS; i++))
do
    run_query simplify-authorities-for-x3ml.xquery "results/works_chunk$i.xml" authorityfile="../../../works.xml" chunk="$i" numchunks="$AUTHORITY_CHUNKS"
done

# Places are written as a single file: one chunk, numbered 0
run_query simplify-authorities-for-x3ml.xquery results/places.xml authorityfile="../../../places.xml" chunk=0 numchunks=1
14 changes: 0 additions & 14 deletions processing/analysis/simplified-xml-for-3m.sh

This file was deleted.

Loading

0 comments on commit f3b05c9

Please sign in to comment.