From f3b05c988636c91fc5140e82fd3f17897d1f0da0 Mon Sep 17 00:00:00 2001 From: Andrew Morrison Date: Wed, 30 Jan 2019 12:19:54 +0000 Subject: [PATCH] X3ML for MMM: Changes to scripts which create simplified XML for X3ML Fixes various bugs that show up in the logs when X3ML is run. Convert author birth/death dates to format required for CIDOC-CRM. --- .gitignore | 1 + .../mmm/simplify-authorities-for-x3ml.xquery | 246 ++++++++++++++++ .../simplify-records-for-x3ml.xquery} | 29 +- .../analysis/mmm/simplify-tei-for-x3ml.sh | 29 ++ processing/analysis/simplified-xml-for-3m.sh | 14 - processing/analysis/simplified4oxlod.xsd | 271 ------------------ 6 files changed, 292 insertions(+), 298 deletions(-) create mode 100644 processing/analysis/mmm/simplify-authorities-for-x3ml.xquery rename processing/analysis/{simplified-xml-for-3m.xquery => mmm/simplify-records-for-x3ml.xquery} (95%) create mode 100755 processing/analysis/mmm/simplify-tei-for-x3ml.sh delete mode 100755 processing/analysis/simplified-xml-for-3m.sh delete mode 100644 processing/analysis/simplified4oxlod.xsd diff --git a/.gitignore b/.gitignore index 3428fe8e33..37c249fdd3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ processing/html/ processing/solr/ processing/analysis/results/ +processing/analysis/mmm/results/ processing/lib/ *.log *.tmp diff --git a/processing/analysis/mmm/simplify-authorities-for-x3ml.xquery b/processing/analysis/mmm/simplify-authorities-for-x3ml.xquery new file mode 100644 index 0000000000..d5c2549ab1 --- /dev/null +++ b/processing/analysis/mmm/simplify-authorities-for-x3ml.xquery @@ -0,0 +1,246 @@ +declare namespace tei="http://www.tei-c.org/ns/1.0"; +declare option saxon:output "indent=yes"; + +declare variable $authorityfile as xs:string external; +declare variable $chunk as xs:integer external; +declare variable $numchunks as xs:integer external; + +declare function local:logging($level as xs:string, $msg as xs:string, $values) +{ + (: Trick XQuery into doing trace() to 
output message to STDERR but not insert it into the XML :) + substring(trace('', concat(upper-case($level), ' ', $msg, ' ', string-join($values, ' '), ' ')), 0, 0) +}; + +declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean) +{ + let $isbce := starts-with($datestring, '-') + let $datestring := if ($isbce) then substring($datestring, 2) else $datestring + let $year := if (string-length($datestring) eq 0) then '' else if (matches($datestring, '^\d\d\d\d$')) then $datestring else if (matches($datestring, '^\d\d\d$')) then concat('0', $datestring) else if (matches($datestring, '^\d\d\d\d')) then substring($datestring, 1, 4) else if (matches($datestring, '^\d\d\d')) then concat('0', substring($datestring, 1, 3)) else '' + let $year := if ($isbce) then concat('-', $year) else $year + let $month := if (matches($datestring, '^\d?\d\d\d-\d\d')) then substring(substring-after($datestring, '-'), 1, 2) else '' + let $day := if (matches($datestring, '^\d?\d\d\d-\d\d-\d\d')) then substring(substring-after(substring-after($datestring, '-'), '-'), 1, 2) else '' + + return + if ($year ne '' and $month ne '' and $day ne '') then + if ($begin) then + xs:dateTime(concat($year, '-', $month, '-', $day, 'T00:00:00')) + else + xs:dateTime(concat($year, '-', $month, '-', $day, 'T23:59:59')) + else if ($year ne '' and $month ne '') then + if ($begin) then + xs:dateTime(concat($year, '-', $month, '-01', 'T00:00:00')) + else + let $nextmonth := xs:integer($month) + 1 + return if ($nextmonth le 12) then + let $nextmonthstring := if ($nextmonth lt 10) then concat('0',xs:string($nextmonth)) else xs:string($nextmonth) + let $dayafter := xs:dateTime(concat($year, '-', $nextmonthstring, '-01', 'T00:00:00')) + return $dayafter - xs:dayTimeDuration('PT1S') + else + xs:dateTime(concat($year, '-', $month, '-31', 'T23:59:59')) + else if ($year ne '') then + if ($begin) then + xs:dateTime(concat($year, '-01-01', 'T00:00:00')) + else + xs:dateTime(concat($year, '-12-31', 
'T23:59:59')) + else + (local:logging('error', 'Unreadable date format', $datestring), ())[2] +}; + +declare function local:extractDates($elem as element()) as element()* +{ + (: Convert dates into to/from child elements needed for mapping to CIDOC-CRM :) + let $begindates := ( + for $date in $elem/(@when|@notBefore|@from)/data() + return + local:dateConversion(normalize-space($date), true()) + ) + let $enddates := ( + for $date in $elem/(@when|@notAfter|@to)/data() + return + local:dateConversion(normalize-space($date), false()) + ) + return + if (count($begindates) gt 0 and count($enddates) gt 0) then + ( + { min($begindates) }, + { max($enddates) } + ) + else if (count($begindates) gt 0) then + { min($begindates) } + else if (count($enddates) gt 0) then + { max($enddates) } + else + () +}; + +declare function local:links($elem as element()) as element()* +{ +let $links := $elem/tei:note[@type='links']//tei:ref[starts-with(@target, 'http')] +return +if (count($links) gt 0) then + + + { + for $l in $links + return + + { normalize-space($l/string()) } + + } + + +else () +}; + +declare function local:otherNotes($elem as element()) as element()* +{ +let $notes := ($elem/tei:note[not(@type='links')], $elem/ancestor::tei:*[starts-with(local-name(), 'list')]/tei:head/tei:note) +for $n in $notes + return + element { 'note' } { $n/@*, normalize-space($n/string()) } +}; + +declare variable $authorityentries as element()* := ( + for $e at $pos in doc($authorityfile)//(tei:bibl|tei:person|tei:place|tei:org)[@xml:id and not(ancestor::tei:bibl or ancestor::tei:person or ancestor::tei:place or ancestor::tei:org)] + return + if ($pos mod $numchunks = $chunk) then $e else () + ); + + + + + { + ( + if (count($authorityentries[self::tei:bibl]) gt 0) then + + { + for $e in $authorityentries[self::tei:bibl] + return + + { normalize-space($e/tei:title[@type='uniform'][1]/string()) } + { + let $variants as xs:string* := for $t in $e/tei:title[not(@type='uniform')] return 
normalize-space($t/string()) + for $v in distinct-values($variants)[string-length(.) gt 0] + return + { $v } + } + { + for $n in $e/(tei:author|tei:textLang) + return + element { local-name($n) } { $n/@*, normalize-space($n/string()) } + } + { + local:links($e) + } + { + local:otherNotes($e) + } + + } + + else (), + if (count($authorityentries[self::tei:person]) gt 0) then + + { + for $e in $authorityentries[self::tei:person] + return + + { normalize-space($e/tei:persName[@type='display'][1]/string()) } + { + let $variants as xs:string* := for $p in $e/tei:persName[not(@type='display')] return normalize-space($p/string()) + for $v in distinct-values($variants)[string-length(.) gt 0] + return + { $v } + } + { + for $d in $e/(tei:birth|tei:death) + return + element { local-name($d) } { + local:extractDates($d) + } + } + { + local:links($e) + } + { + local:otherNotes($e) + } + + } + + else (), + if (count($authorityentries[self::tei:place]) gt 0) then + + { + for $e in $authorityentries[self::tei:place] + return + + { normalize-space($e/tei:placeName[@type='index'][1]/string()) } + { + let $variants as xs:string* := for $p in $e/tei:placeName[not(@type='index')] return normalize-space($p/string()) + for $v in distinct-values($variants)[string-length(.) gt 0] + return + { $v } + } + { + for $n in $e/(tei:country|tei:location) + return + element { local-name($n) } { $n/@*, normalize-space($n/string()) } + } + { + if ($e/tei:location/tei:geo) then + + { normalize-space($e/tei:location/tei:geo[1]/string()) } + + else () + } + { + local:links($e) + } + { + local:otherNotes($e) + } + + } + + else (), + if (count($authorityentries[self::tei:org]) gt 0) then + + { + for $e in $authorityentries[self::tei:org] + return + + { normalize-space($e/tei:orgName[@type='display'][1]/string()) } + { + let $variants as xs:string* := for $o in $e/tei:orgName[not(@type='display')] return normalize-space($o/string()) + for $v in distinct-values($variants)[string-length(.) 
gt 0] + return + { $v } + } + { + for $n in $e/tei:country + return + element { local-name($n) } { $n/@*, normalize-space($n/string()) } + } + { + if ($e/tei:location/tei:geo) then + + { normalize-space($e/tei:location/tei:geo[1]/string()) } + + else () + } + { + local:links($e) + } + { + local:otherNotes($e) + } + + } + + else () + ) + } + + + diff --git a/processing/analysis/simplified-xml-for-3m.xquery b/processing/analysis/mmm/simplify-records-for-x3ml.xquery similarity index 95% rename from processing/analysis/simplified-xml-for-3m.xquery rename to processing/analysis/mmm/simplify-records-for-x3ml.xquery index bd41651aff..5d630e1779 100644 --- a/processing/analysis/simplified-xml-for-3m.xquery +++ b/processing/analysis/mmm/simplify-records-for-x3ml.xquery @@ -1,7 +1,9 @@ declare namespace tei="http://www.tei-c.org/ns/1.0"; declare option saxon:output "indent=yes"; +declare variable $collectionsfolder as xs:string external; declare variable $chunk as xs:integer external; +declare variable $numchunks as xs:integer external; declare variable $website := 'https://medieval.bodleian.ox.ac.uk'; @@ -83,11 +85,12 @@ declare function local:languageCodeLookup($lang as xs:string) as xs:string* case 'gd' return ('Gaelic', 'http://vocab.getty.edu/aat/300388323') case 'fy' return ('Frisian', 'http://vocab.getty.edu/aat/300388308') case 'dlm' return ('Dalmatian', 'http://vocab.getty.edu/aat/300388199') + case 'cai' return ('Central American Indian', 'http://vocab.getty.edu/aat/300388079') case 'zxx' return () default return (local:logging('error', 'Unrecognized language code', $lang), ())[2] }; -declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean) as xs:dateTime* +declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean) { let $isbce := starts-with($datestring, '-') let $datestring := if ($isbce) then substring($datestring, 2) else $datestring @@ -126,7 +129,7 @@ declare function local:listItems($manuscript as 
element(tei:TEI), $mscontent as { for $msItem in $mscontent//tei:msItem[tei:title/@key] (: This flattens all works (msItem) into one list. In the TEI, there can be a hierarchy of works-within-works. :) - let $itemid := $msItem/@xml:id/data() + let $itemid := ($msItem/@xml:id/data(), generate-id($msItem))[1] let $workid := ($msItem/tei:title/@key)[1]/data() return @@ -240,7 +243,7 @@ declare function local:extractPhysicalFields($physdesc as element(tei:physDesc)? declare function local:extractDates($history as element(tei:history)?) as element()* { - (: Simply dates to a single range for each type. This means losing some detail + (: Simplify dates to a single range for each type. This means losing some detail (e.g. text written in one century, illustrations in the next.) but that would be difficult to model in 3M anyway. :) ( @@ -401,31 +404,31 @@ declare function local:listDigitizedCopies($surrogates as element(tei:surrogates { $ref/@target/data() } }; -processing-instruction xml-model {'href="https://raw.githubusercontent.com/bodleian/medieval-mss/master/processing/analysis/simplified4oxlod.xsd" type="application/xml" schematypens="http://www.w3.org/2001/XMLSchema"'}, { - for $manuscript at $pos in collection('../../collections/?select=*.xml;recurse=yes')/tei:TEI + for $manuscript at $pos in collection(concat($collectionsfolder, '/?select=*.xml;recurse=yes'))/tei:TEI - return if ($pos mod 20 = $chunk) then - (: - To process everything, change above line to: return if (true()) then - To get a small random-ish sample, use: return if ($pos mod 200 = 0) then - :) + return if ($pos mod $numchunks = $chunk) then + { $website }/catalog/{ $manuscript/@xml:id/data() } { $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno[1]/text() } { $manuscript/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type="collection"]/text() } { $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:repository/text() } - { 
$manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution/text() } - + { + if ($manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution) then + { $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution/text() } + else + () + } { (: All manuscripts have at least one msContents element, but in multi-part manuscripts each is a child of an msPart :) for $mscontent in $manuscript//tei:msContents return if ($mscontent/parent::tei:msPart) then - { $website }/catalog/{ $manuscript/@xml:id/data() }#{ $mscontent/parent::tei:msPart/@xml:id/data() } + { $website }/catalog/{ $manuscript/@xml:id/data() }#{ ($mscontent/parent::tei:msPart/@xml:id/data(), generate-id($mscontent/parent::tei:msPart))[1] } { local:extractPhysicalFields($mscontent/parent::tei:msPart/tei:physDesc) } { local:extractDates($mscontent/parent::tei:msPart/tei:history) } diff --git a/processing/analysis/mmm/simplify-tei-for-x3ml.sh b/processing/analysis/mmm/simplify-tei-for-x3ml.sh new file mode 100755 index 0000000000..a9310d7a91 --- /dev/null +++ b/processing/analysis/mmm/simplify-tei-for-x3ml.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Create subfolder to keep generated files out of GitHub +if [ ! -d "results" ]; then + mkdir results +fi + +LOGFILE="results/simplify-tei-for-x3ml.log" + +date > $LOGFILE + +# Generate the simplified XML, extracting the desired information from the ~10K TEI records for each manuscript. +# Also output to 20 chunks, to avoid memory issues when they are subsequently processed by x3ml +for i in {0..19} +do + java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-records-for-x3ml.xquery -o:results/manuscripts_chunk$i.xml collectionsfolder="../../../collections" chunk=$i numchunks=20 2>> $LOGFILE +done + +# Strip out namespaces from authority files, which x3ml cannot handle no matter how I try to declare it +# in the mapping file. 
Also chunk persons and works, again so the x3ml can process them in minutes rather than hours. +for i in {0..4} +do + java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-authorities-for-x3ml.xquery -o:results/persons_chunk$i.xml authorityfile="../../../persons.xml" chunk=$i numchunks=5 2>> $LOGFILE +done +for i in {0..4} +do + java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-authorities-for-x3ml.xquery -o:results/works_chunk$i.xml authorityfile="../../../works.xml" chunk=$i numchunks=5 2>> $LOGFILE +done +java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-authorities-for-x3ml.xquery -o:results/places.xml authorityfile="../../../places.xml" chunk=0 numchunks=1 2>> $LOGFILE diff --git a/processing/analysis/simplified-xml-for-3m.sh b/processing/analysis/simplified-xml-for-3m.sh deleted file mode 100755 index f2b74c4425..0000000000 --- a/processing/analysis/simplified-xml-for-3m.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -# Create subfolder to keep generated files out of GitHub -if [ ! 
-d "results" ]; then - mkdir results -fi - -LOGFILE="results/simplified-xml-for-3m.log" - -# Generate the simplied XML, outputting to 20 chunks, which avoids memory issues when they will be processed by x3ml -for i in {0..19} -do - java -Xmx1G -cp ../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplified-xml-for-3m.xquery -o:results/simplified-xml-for-3m-chunk$i.xml chunk=$i 2>> $LOGFILE -done \ No newline at end of file diff --git a/processing/analysis/simplified4oxlod.xsd b/processing/analysis/simplified4oxlod.xsd deleted file mode 100644 index 4726a0df86..0000000000 --- a/processing/analysis/simplified4oxlod.xsd +++ /dev/null @@ -1,271 +0,0 @@ - - - - - - - This schema specifies an intermediate metadata format for extracting the essentials from a TEI header - in a predictable structure for easier mapping to RDF in the 3M editor for use by OXLOD, - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -