processing/analysis/mmm/simplify-authorities-for-x3ml.xquery
processing/analysis/mmm/simplify-authorities-for-x3ml.xquery
@@ -0,0 +1,246 @@
+declare namespace tei="http://www.tei-c.org/ns/1.0";
+declare option saxon:output "indent=yes";
+declare variable $authorityfile as xs:string external;
+declare variable $chunk as xs:integer external;
+declare variable $numchunks as xs:integer external;
+declare function local:logging($level as xs:string, $msg as xs:string, $values)
+ (: Write a diagnostic message through trace() and return a zero-length string, so the call can be
+    embedded in a result-building expression without contributing any text.
+    $level  - severity label; it is upper-cased at the start of the traced message
+    $msg    - the message text
+    $values - extra items appended to the message, joined with single spaces by string-join() :)
+ (: Trick XQuery into doing trace() to output message to STDERR but not insert it into the XML :)
+ substring(trace('', concat(upper-case($level), ' ', $msg, ' ', string-join($values, ' '), ' ')), 0, 0)
+declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean)
+ (: Convert a date string of the form YYYY, YYYY-MM or YYYY-MM-DD (three-digit years are zero-padded,
+    a leading '-' marks a BCE year) into an xs:dateTime marking one edge of the period it denotes.
+    $datestring - the date text to interpret
+    $begin      - true() for the first second of the period, false() for the last second
+    Returns the xs:dateTime, or the empty sequence (after logging an error) when no year is recognised. :)
+ let $isbce := starts-with($datestring, '-')
+ let $datestring := if ($isbce) then substring($datestring, 2) else $datestring
+ let $year := if (string-length($datestring) eq 0) then '' else if (matches($datestring, '^\d\d\d\d$')) then $datestring else if (matches($datestring, '^\d\d\d$')) then concat('0', $datestring) else if (matches($datestring, '^\d\d\d\d')) then substring($datestring, 1, 4) else if (matches($datestring, '^\d\d\d')) then concat('0', substring($datestring, 1, 3)) else ''
+ (: Re-attach the BCE sign only when a year was actually recognised. Otherwise an unreadable value
+    such as '-' would produce the non-empty "year" '-', and the xs:dateTime() cast below would raise
+    a dynamic error instead of reaching the logged 'Unreadable date format' branch. :)
+ let $year := if ($isbce and $year ne '') then concat('-', $year) else $year
+ let $month := if (matches($datestring, '^\d?\d\d\d-\d\d')) then substring(substring-after($datestring, '-'), 1, 2) else ''
+ let $day := if (matches($datestring, '^\d?\d\d\d-\d\d-\d\d')) then substring(substring-after(substring-after($datestring, '-'), '-'), 1, 2) else ''
+ return
+ if ($year ne '' and $month ne '' and $day ne '') then
+ if ($begin) then
+ xs:dateTime(concat($year, '-', $month, '-', $day, 'T00:00:00'))
+ else
+ xs:dateTime(concat($year, '-', $month, '-', $day, 'T23:59:59'))
+ else if ($year ne '' and $month ne '') then
+ if ($begin) then
+ xs:dateTime(concat($year, '-', $month, '-01', 'T00:00:00'))
+ else
+ (: End of a month: take the first instant of the following month and step back one second,
+    which avoids needing to know the month's length. December has no following month in the
+    same year, so it is handled directly as the 31st. :)
+ let $nextmonth := xs:integer($month) + 1
+ return if ($nextmonth le 12) then
+ let $nextmonthstring := if ($nextmonth lt 10) then concat('0',xs:string($nextmonth)) else xs:string($nextmonth)
+ let $dayafter := xs:dateTime(concat($year, '-', $nextmonthstring, '-01', 'T00:00:00'))
+ return $dayafter - xs:dayTimeDuration('PT1S')
+ else
+ xs:dateTime(concat($year, '-', $month, '-31', 'T23:59:59'))
+ else if ($year ne '') then
+ if ($begin) then
+ xs:dateTime(concat($year, '-01-01', 'T00:00:00'))
+ else
+ xs:dateTime(concat($year, '-12-31', 'T23:59:59'))
+ else
+ (: Log the problem, then select the second item of the sequence so the result is empty :)
+ (local:logging('error', 'Unreadable date format', $datestring), ())[2]
+declare function local:extractDates($elem as element()) as element()*
+ (: Convert dates into to/from child elements needed for mapping to CIDOC-CRM :)
+ (: NOTE(review): the declared return type is element()*, but no element constructors are visible
+    around the min()/max() expressions below - presumably they were lost from this listing; confirm
+    against the real file. :)
+ (: Candidate start instants: every @when, @notBefore and @from value, read as the start of its period :)
+ let $begindates := (
+ for $date in $elem/(@when|@notBefore|@from)/data()
+ return
+ local:dateConversion(normalize-space($date), true())
+ )
+ (: Candidate end instants: every @when, @notAfter and @to value, read as the end of its period :)
+ let $enddates := (
+ for $date in $elem/(@when|@notAfter|@to)/data()
+ return
+ local:dateConversion(normalize-space($date), false())
+ )
+ return
+ if (count($begindates) gt 0 and count($enddates) gt 0) then
+ (
+ { min($begindates) },
+ { max($enddates) }
+ )
+ else if (count($begindates) gt 0) then
+ { min($begindates) }
+ else if (count($enddates) gt 0) then
+ { max($enddates) }
+ else
+ ()
+declare function local:links($elem as element()) as element()*
+(: List the external links recorded for an authority entry: every tei:ref found anywhere inside a
+   tei:note of type 'links' that is a child of $elem, provided its @target starts with 'http'.
+   For each link the normalized text of the ref and its @target are output; nothing is returned
+   when there are no such links.
+   NOTE(review): the wrapping element constructors are not visible in this listing - confirm the
+   output structure against the real file. :)
+let $links := $elem/tei:note[@type='links']//tei:ref[starts-with(@target, 'http')]
+if (count($links) gt 0) then
+ {
+ for $l in $links
+ return
+ -
[{ normalize-space($l/string()) }]({ $l/@target })
+ }
+else ()
+declare function local:otherNotes($elem as element()) as element()*
+(: Copy the remaining notes for an authority entry as plain 'note' elements. Two sources are used:
+   the entry's own tei:note children other than the 'links' note, followed by any tei:note inside
+   the tei:head of an ancestor TEI element whose local name starts with 'list'.
+   Each output note keeps the original attributes and carries the whitespace-normalized text. :)
+let $notes := ($elem/tei:note[not(@type='links')], $elem/ancestor::tei:*[starts-with(local-name(), 'list')]/tei:head/tei:note)
+for $n in $notes
+ return
+ element { 'note' } { $n/@*, normalize-space($n/string()) }
+declare variable $authorityentries as element()* := (
+ (: Take every bibl, person, place or org in the authority file that has an @xml:id and is not nested
+    inside another bibl/person/place/org, then keep only those whose position in that list, modulo
+    $numchunks, equals $chunk. Running the query once for each chunk value from 0 to $numchunks - 1
+    therefore covers every entry exactly once. :)
+ for $e at $pos in doc($authorityfile)//(tei:bibl|tei:person|tei:place|tei:org)[@xml:id and not(ancestor::tei:bibl or ancestor::tei:person or ancestor::tei:place or ancestor::tei:org)]
+ return
+ if ($pos mod $numchunks = $chunk) then $e else ()
+ );
+ {
+ (
+ (: Main output: one section per entry type, emitted only when this chunk contains entries of that
+    type. Each entry outputs its preferred name, its distinct non-empty variant names, type-specific
+    details, then its links and other notes.
+    NOTE(review): the element constructors wrapping these expressions are not visible in this
+    listing - confirm the output element names against the real file. :)
+ (: Works: preferred name is the first title of type 'uniform' :)
+ if (count($authorityentries[self::tei:bibl]) gt 0) then
+ {
+ for $e in $authorityentries[self::tei:bibl]
+ return
+ { normalize-space($e/tei:title[@type='uniform'][1]/string()) }
+ {
+ let $variants as xs:string* := for $t in $e/tei:title[not(@type='uniform')] return normalize-space($t/string())
+ for $v in distinct-values($variants)[string-length(.) gt 0]
+ return
+ { $v }
+ }
+ {
+ for $n in $e/(tei:author|tei:textLang)
+ return
+ element { local-name($n) } { $n/@*, normalize-space($n/string()) }
+ }
+ {
+ local:links($e)
+ }
+ {
+ local:otherNotes($e)
+ }
+ }
+ else (),
+ (: People: preferred name is the first persName of type 'display'; birth and death dates are
+    converted by local:extractDates :)
+ if (count($authorityentries[self::tei:person]) gt 0) then
+ {
+ for $e in $authorityentries[self::tei:person]
+ return
+ { normalize-space($e/tei:persName[@type='display'][1]/string()) }
+ {
+ let $variants as xs:string* := for $p in $e/tei:persName[not(@type='display')] return normalize-space($p/string())
+ for $v in distinct-values($variants)[string-length(.) gt 0]
+ return
+ { $v }
+ }
+ {
+ for $d in $e/(tei:birth|tei:death)
+ return
+ element { local-name($d) } {
+ local:extractDates($d)
+ }
+ }
+ {
+ local:links($e)
+ }
+ {
+ local:otherNotes($e)
+ }
+ }
+ else (),
+ (: Places: preferred name is the first placeName of type 'index'; the first tei:geo inside
+    tei:location is output when present :)
+ if (count($authorityentries[self::tei:place]) gt 0) then
+ {
+ for $e in $authorityentries[self::tei:place]
+ return
+ { normalize-space($e/tei:placeName[@type='index'][1]/string()) }
+ {
+ let $variants as xs:string* := for $p in $e/tei:placeName[not(@type='index')] return normalize-space($p/string())
+ for $v in distinct-values($variants)[string-length(.) gt 0]
+ return
+ { $v }
+ }
+ {
+ for $n in $e/(tei:country|tei:location)
+ return
+ element { local-name($n) } { $n/@*, normalize-space($n/string()) }
+ }
+ {
+ if ($e/tei:location/tei:geo) then
+ { normalize-space($e/tei:location/tei:geo[1]/string()) }
+ else ()
+ }
+ {
+ local:links($e)
+ }
+ {
+ local:otherNotes($e)
+ }
+ }
+ else (),
+ (: Organisations: preferred name is the first orgName of type 'display'; unlike places, only
+    tei:country (not tei:location) is copied as an element :)
+ if (count($authorityentries[self::tei:org]) gt 0) then
+ {
+ for $e in $authorityentries[self::tei:org]
+ return
+ { normalize-space($e/tei:orgName[@type='display'][1]/string()) }
+ {
+ let $variants as xs:string* := for $o in $e/tei:orgName[not(@type='display')] return normalize-space($o/string())
+ for $v in distinct-values($variants)[string-length(.) gt 0]
+ return
+ { $v }
+ }
+ {
+ for $n in $e/tei:country
+ return
+ element { local-name($n) } { $n/@*, normalize-space($n/string()) }
+ }
+ {
+ if ($e/tei:location/tei:geo) then
+ { normalize-space($e/tei:location/tei:geo[1]/string()) }
+ else ()
+ }
+ {
+ local:links($e)
+ }
+ {
+ local:otherNotes($e)
+ }
+ }
+ else ()
+ )
+ }
processing/analysis/simplified-xml-for-3m.xquery renamed to processing/analysis/mmm/simplify-records-for-x3ml.xquery
rename from processing/analysis/simplified-xml-for-3m.xquery
rename to processing/analysis/mmm/simplify-records-for-x3ml.xquery
--- a/processing/analysis/simplified-xml-for-3m.xquery
+++ b/processing/analysis/mmm/simplify-records-for-x3ml.xquery
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare option saxon:output "indent=yes";
+declare variable $collectionsfolder as xs:string external;
declare variable $chunk as xs:integer external;
+declare variable $numchunks as xs:integer external;
declare variable $website := 'https://medieval.bodleian.ox.ac.uk';
@@ -83,11 +85,12 @@ declare function local:languageCodeLookup($lang as xs:string) as xs:string*
case 'gd' return ('Gaelic', 'http://vocab.getty.edu/aat/300388323')
case 'fy' return ('Frisian', 'http://vocab.getty.edu/aat/300388308')
case 'dlm' return ('Dalmatian', 'http://vocab.getty.edu/aat/300388199')
+ case 'cai' return ('Central American Indian', 'http://vocab.getty.edu/aat/300388079')
case 'zxx' return ()
default return (local:logging('error', 'Unrecognized language code', $lang), ())[2]
-declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean) as xs:dateTime*
+declare function local:dateConversion($datestring as xs:string, $begin as xs:boolean)
let $isbce := starts-with($datestring, '-')
let $datestring := if ($isbce) then substring($datestring, 2) else $datestring
@@ -126,7 +129,7 @@ declare function local:listItems($manuscript as element(tei:TEI), $mscontent as
for $msItem in $mscontent//tei:msItem[tei:title/@key]
(: This flattens all works (msItem) into one list. In the TEI, there can be a hierarchy of works-within-works. :)
- let $itemid := $msItem/@xml:id/data()
+ let $itemid := ($msItem/@xml:id/data(), generate-id($msItem))[1]
let $workid := ($msItem/tei:title/@key)[1]/data()
@@ -240,7 +243,7 @@ declare function local:extractPhysicalFields($physdesc as element(tei:physDesc)?
declare function local:extractDates($history as element(tei:history)?) as element()*
- (: Simply dates to a single range for each type. This means losing some detail
+ (: Simplify dates to a single range for each type. This means losing some detail
(e.g. text written in one century, illustrations in the next.) but that would
be difficult to model in 3M anyway. :)
@@ -401,31 +404,31 @@ declare function local:listDigitizedCopies($surrogates as element(tei:surrogates
{ $ref/@target/data() }
-processing-instruction xml-model {'href="https://raw.githubusercontent.com/bodleian/medieval-mss/master/processing/analysis/simplified4oxlod.xsd" type="application/xml" schematypens="http://www.w3.org/2001/XMLSchema"'},
- for $manuscript at $pos in collection('../../collections/?select=*.xml;recurse=yes')/tei:TEI
+ for $manuscript at $pos in collection(concat($collectionsfolder, '/?select=*.xml;recurse=yes'))/tei:TEI
- return if ($pos mod 20 = $chunk) then
- (:
- To process everything, change above line to: return if (true()) then
- To get a small random-ish sample, use: return if ($pos mod 200 = 0) then
- :)
+ return if ($pos mod $numchunks = $chunk) then
{ $website }/catalog/{ $manuscript/@xml:id/data() }
{ $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:idno[1]/text() }
{ $manuscript/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type="collection"]/text() }
{ $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:repository/text() }
- { $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution/text() }
+ {
+ if ($manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution) then
+ { $manuscript/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:msDesc/tei:msIdentifier/tei:institution/text() }
+ else
+ ()
+ }
(: All manuscripts have at least one msContents element, but in multi-part manuscripts each is a child of an msPart :)
for $mscontent in $manuscript//tei:msContents
if ($mscontent/parent::tei:msPart) then
- { $website }/catalog/{ $manuscript/@xml:id/data() }#{ $mscontent/parent::tei:msPart/@xml:id/data() }
+ { $website }/catalog/{ $manuscript/@xml:id/data() }#{ ($mscontent/parent::tei:msPart/@xml:id/data(), generate-id($mscontent/parent::tei:msPart))[1] }
{ local:extractPhysicalFields($mscontent/parent::tei:msPart/tei:physDesc) }
{ local:extractDates($mscontent/parent::tei:msPart/tei:history) }
processing/analysis/mmm/simplify-tei-for-x3ml.sh
processing/analysis/mmm/simplify-tei-for-x3ml.sh
+#!/usr/bin/env bash
+# Create subfolder to keep generated files out of GitHub
+if [ ! -d "results" ]; then
+ mkdir results
+# Start a fresh log containing the current date; every Saxon run below appends its stderr to it.
+# NOTE(review): LOGFILE is not assigned anywhere in the visible lines - confirm it is set (or
+# exported by the caller) before this point, otherwise these redirections have no target.
+date > $LOGFILE
+# Generate the simplified XML, extracting the desired information from the ~10K TEI records for each manuscript.
+# Also output to 20 chunks, to avoid memory issues when they are subsequently processed by x3ml
+# The range 0..19 must stay in step with numchunks=20: the query keeps records whose position modulo
+# numchunks equals the chunk number.
+for i in {0..19}
+ java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-records-for-x3ml.xquery -o:results/manuscripts_chunk$i.xml collectionsfolder="../../../collections" chunk=$i numchunks=20 2>> $LOGFILE
+# Strip out namespaces from authority files, which x3ml cannot handle no matter how I try to declare it
+# in the mapping file. Also chunk persons and works, again so the x3ml can process them in minutes rather than hours.
+# Persons: 5 chunks (0..4 with numchunks=5)
+for i in {0..4}
+ java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-authorities-for-x3ml.xquery -o:results/persons_chunk$i.xml authorityfile="../../../persons.xml" chunk=$i numchunks=5 2>> $LOGFILE
+# Works: 5 chunks (0..4 with numchunks=5)
+for i in {0..4}
+ java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-authorities-for-x3ml.xquery -o:results/works_chunk$i.xml authorityfile="../../../works.xml" chunk=$i numchunks=5 2>> $LOGFILE
+# Places are written to a single file: with numchunks=1 every position modulo 1 is 0, so chunk=0 selects all entries
+java -Xmx1G -cp ../../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplify-authorities-for-x3ml.xquery -o:results/places.xml authorityfile="../../../places.xml" chunk=0 numchunks=1 2>> $LOGFILE
processing/analysis/simplified-xml-for-3m.sh (deleted)
processing/analysis/simplified-xml-for-3m.sh (deleted)
-#!/usr/bin/env bash
-# Create subfolder to keep generated files out of GitHub
-if [ ! -d "results" ]; then
- mkdir results
-# Generate the simplied XML, outputting to 20 chunks, which avoids memory issues when they will be processed by x3ml
-for i in {0..19}
- java -Xmx1G -cp ../saxon/saxon9he.jar net.sf.saxon.Query -xi:on -q:simplified-xml-for-3m.xquery -o:results/simplified-xml-for-3m-chunk$i.xml chunk=$i 2>> $LOGFILE
\ No newline at end of file
processing/analysis/simplified4oxlod.xsd (deleted)
processing/analysis/simplified4oxlod.xsd (deleted)
- This schema specifies an intermediate metadata format for extracting the essentials from a TEI header
- in a predictable structure for easier mapping to RDF in the 3M editor for use by OXLOD,