Lumi changes #26

Draft: wants to merge 35 commits into base: master

Commits (35)
78818c7  Ignore vim swap files (ZJaume, Apr 26, 2023)
e56da94  Update giashard install instructions (ZJaume, Apr 26, 2023)
10db860  LUMI config environment (ZJaume, May 9, 2023)
da369d3  Add translate-bergamot.sh script (ZJaume, May 11, 2023)
a91e303  Override env variables for DOCALIGN and BLEUALIGN (ZJaume, May 11, 2023)
02aeadf  Fix bicleaner env variables setup (ZJaume, May 11, 2023)
605b176  Fix typo (ZJaume, May 12, 2023)
c5d150f  Stop requesting more memory on align (ZJaume, May 12, 2023)
f3ca994  Increase mem-per-cpu on small partition (ZJaume, May 12, 2023)
1477b17  Write directly to $SCRATCH instead of node temp in align and split (ZJaume, May 15, 2023)
92f0d38  Cut out docalign scores before docjoin (ZJaume, May 15, 2023)
96f2ccf  Docjoin does not expect docalign score in Bitextor 8.1 (ZJaume, May 15, 2023)
9b23df1  Ignore the model directory of individual language pairs (ZJaume, May 15, 2023)
93f95f3  Remove pairs.txt debugging output in 06.align (ZJaume, May 16, 2023)
2cc6047  Set path to stable bitextor env (ZJaume, May 19, 2023)
d1cbfc1  Set HF cache for bicleaner-ai (ZJaume, May 19, 2023)
4cfef0b  Add install script for paracrawl cpp code (ZJaume, May 19, 2023)
78bff5c  Update bifixer and kenlm installations (ZJaume, May 19, 2023)
e6c3d57  Override binaries locations in init not setup (ZJaume, May 19, 2023)
2c489e3  Use foldfilter in translate-bergamot (ZJaume, May 19, 2023)
c5046eb  Undo executables env variables overrided in init instead of config (ZJaume, Jun 6, 2023)
de9dbfa  Fix scoring step (ZJaume, Jul 6, 2023)
65a7094  Fix 09.clean step (ZJaume, Jul 6, 2023)
b665411  Create cleaning directory if it doesn't exist (ZJaume, Jul 6, 2023)
04e918f  Fix 11.reduce filtered step (ZJaume, Jul 6, 2023)
7bcdbd9  Fix 12.reduce.tmx (ZJaume, Jul 6, 2023)
aa69628  07.fix read model data from huggingface cache (jelmervdl, Jul 18, 2023)
036e160  Remove cache from tokenize pipeline (jelmervdl, Jul 18, 2023)
9fbe192  Add --will-cite where necessary (jelmervdl, Jul 18, 2023)
7821e8d  Fix bergamot model configuration path (jelmervdl, Jul 18, 2023)
d822f60  Add LUMI base module (jelmervdl, Jul 18, 2023)
5bc7971  Use singularity container (ZJaume, Sep 26, 2024)
0511f8a  Use loomchild segmenter and text.gz (ZJaume, Sep 26, 2024)
d4e0acf  Adapt translate-bergamot to marian container path and use config.yml (ZJaume, Sep 26, 2024)
615ef60  Use tokeniser.perl inside the container (ZJaume, Sep 26, 2024)
4 changes: 4 additions & 0 deletions .gitignore
@@ -26,3 +26,7 @@
/env/src/recode-*
/env/src/hunspell-*
filtered-terms.txt
*.swp
models/??-??
*.sif
core
16 changes: 5 additions & 11 deletions 03.split-text
@@ -5,26 +5,20 @@ ulimit -n 16384
SLANG="$1"
BATCH="$2"

SPLIT="perl $KPU/moses/ems/support/split-sentences.perl"

echo "Processing (${SLANG}) ${BATCH}"

< ${BATCH}/plain_text.gz gzip -dc \
| $SPLIT -k -q -n -d -l $SLANG -c 524288 \
< ${BATCH}/text.gz gzip -dc \
| py-segment -l $SLANG \
| gzip -9c \
> ${TMPDIR}/sentences.$$.gz
> ${BATCH}/sentences.$$.gz

echo "Testing output"

docs_pt=$(gzip -cd ${BATCH}/plain_text.gz | wc -l)
docs_st=$(gzip -cd ${TMPDIR}/sentences.$$.gz | wc -l)
docs_pt=$(gzip -cd ${BATCH}/text.gz | wc -l)
docs_st=$(gzip -cd ${BATCH}/sentences.$$.gz | wc -l)
echo "Expecting $docs_pt documents, found $docs_st"
test $docs_pt -eq $docs_st || exit 1

# Move in two steps. First copies it to the shared fs which
# might fail because it hits a quota. Second marks it as
# the real thing.
mv ${TMPDIR}/sentences.$$.gz ${BATCH}/sentences.$$.gz
mv ${BATCH}/sentences.$$.gz ${BATCH}/sentences.gz
echo "Copied result (${SLANG}) ${BATCH}"
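The document-count test above guards against truncated writes on the shared filesystem. A minimal self-contained sketch of the same pattern (file names and the identity "segmenter" are stand-ins, not the real py-segment):

```shell
# Sketch of the per-batch integrity check in 03.split-text: the number of
# (base64-encoded) document lines must survive sentence splitting unchanged.
TMP=$(mktemp -d)
printf 'ZG9jMQ==\nZG9jMg==\n' | gzip -9c > $TMP/text.gz

# Stand-in for the py-segment step: decompress and recompress unchanged.
gzip -dc $TMP/text.gz | gzip -9c > $TMP/sentences.gz

docs_in=$(gzip -cd $TMP/text.gz | wc -l)
docs_out=$(gzip -cd $TMP/sentences.gz | wc -l)
echo "Expecting $docs_in documents, found $docs_out"
test "$docs_in" -eq "$docs_out" || exit 1
```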

11 changes: 4 additions & 7 deletions 05.tokenise
@@ -23,25 +23,22 @@ export -f tokenise
echo "Processing (${SLANG}) ${BATCH}"

< ${BATCH}/${INPUT}.gz gzip -dc \
| b64filter cache bash -c tokenise \
| b64filter bash -c tokenise \
| gzip -9c \
> ${TMPDIR}/${OUTPUT}.$TMPSFX.gz
> ${BATCH}/${OUTPUT}.$TMPSFX.gz

echo "Checking output"

docs_st=$(gzip -cd ${BATCH}/${INPUT}.gz | wc -l)
docs_tk=$(gzip -cd ${TMPDIR}/${OUTPUT}.$TMPSFX.gz | wc -l)
docs_tk=$(gzip -cd ${BATCH}/${OUTPUT}.$TMPSFX.gz | wc -l)
echo "Expecting $docs_st documents, found $docs_tk"
test $docs_st -eq $docs_tk || exit 1

lines_st=$(gzip -cd ${BATCH}/${INPUT}.gz | base64 -d | wc -l)
lines_tk=$(gzip -cd ${TMPDIR}/${OUTPUT}.$TMPSFX.gz | base64 -d | wc -l)
lines_tk=$(gzip -cd ${BATCH}/${OUTPUT}.$TMPSFX.gz | base64 -d | wc -l)
echo "Expecting $lines_st lines, found $lines_tk"
test $lines_st -eq $lines_tk || exit 1

# Two-step move because the first one might fail and leave an
# incomplete file behind, which is tricky to detect.
mv ${TMPDIR}/${OUTPUT}.$TMPSFX.gz ${BATCH}/${OUTPUT}.$TMPSFX.gz
mv ${BATCH}/${OUTPUT}.$TMPSFX.gz ${BATCH}/${OUTPUT}.gz

echo "Moved result (${SLANG}) ${BATCH}/${OUTPUT}.gz"
4 changes: 3 additions & 1 deletion 06.align
@@ -18,7 +18,7 @@ TMPSFX=${JOB_ID:-$$}
${DOCALIGN} -j ${DOCALIGN_THREADS:-$THREADS} --threshold 0.1 \
${SRC_BATCH}/tokenised_${TARGET_LANG%~*}.gz \
${REF_BATCH}/tokenised_${TARGET_LANG%~*}.gz \
| tee ${SRC_BATCH}/pairs-${TARGET_LANG%~*}-${REF_BATCH_ID}.txt \
| cut -f2- \
| ${DOCJOIN} \
-li\
-ri\
@@ -27,6 +27,7 @@ ${DOCALIGN} -j ${DOCALIGN_THREADS:-$THREADS} --threshold 0.1 \
-l ${SRC_BATCH}/sentences_${TARGET_LANG%~*}.gz\
| /usr/bin/time -f '{"task":"bleualign", "pair":'"$PAIR_FORMAT"', "time":'"$TIME_FORMAT"'}' \
parallel \
--will-cite \
--tmpdir=$TMPDIR \
-j${BLEUALIGN_THREADS:-$THREADS} \
--halt 2 \
@@ -36,5 +37,6 @@ parallel \
${BLEUALIGN} --print-sent-hash --bleu-threshold 0.2 \
| gzip -c \
> ${SRC_BATCH}/aligned-${REF_BATCH_ID}.gz.$TMPSFX

mv ${SRC_BATCH}/aligned-${REF_BATCH_ID}.gz{.$TMPSFX,}

7 changes: 0 additions & 7 deletions 06.align.sh
@@ -45,13 +45,6 @@ declare -a OPTIONS=(
-o ${SLURM_LOGS}/06.align-%A_%a.out
)

# Quick hack, should be a --option option, but functions.sh doesn't
# allow for that at the moment. Someday...
if [[ ! -z ${OOM_PROOF:-} ]]; then
OPTIONS+=(--mem-per-cpu 12G)
export BLEUALIGN_THREADS=4
fi

collection=$1
shift

7 changes: 5 additions & 2 deletions 07.fix
@@ -26,11 +26,14 @@ remove_empty_lines() {
awk -F"\t" '$3 != "" && $4 != "" { print }'
}

# Fix bicleaner model path for non-huggingface tools (aka bicleaner-hardrules)
BICLEANER_MODEL_GIT_DIR=$HUGGINGFACE_HUB_CACHE/models--${BICLEANER_MODEL//\//--}
BICLEANER_MODEL=${BICLEANER_MODEL_GIT_DIR}/snapshots/$(cat $BICLEANER_MODEL_GIT_DIR/refs/main)

for match in $batch/aligned-+([0-9]).gz; do
echo $match 1>&2
matched_batch=$(echo $match | sed 's/.*-\([0-9]*\)\.gz/\1/')
paste <(gzip -cd ${match} \
| awk -F '\t' '{ print 0.0 "\t" $1 "\t" $2}' `# bitextor's docjoin expects a score column, which it then ignores` \
| docjoin \
-r ${target_lang_data}/${shard}/${matched_batch}/url.gz \
-l $(dirname ${match})/url.gz) `# 1,2: target & source url`\
@@ -53,7 +56,7 @@ done \
--target_lang $bicleaner_lang \
--scol 3 \
--tcol 4 \
--metadata $BICLEANER_MODEL \
--metadata $BICLEANER_MODEL/metadata.yaml \
/dev/stdin /dev/stdout \
| pigz -9c \
>$HARDRULED.$TMPSFX
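The snapshot resolution added to 07.fix follows huggingface_hub's on-disk cache layout: a `models--ORG--NAME` directory whose `refs/main` file names a directory under `snapshots/`. A sketch against a fabricated cache, since bicleaner-hardrules needs a plain filesystem path rather than a Hub model name (the model name and commit hash here are hypothetical):

```shell
# Fabricated cache mirroring huggingface_hub's layout; model name is made up.
HUGGINGFACE_HUB_CACHE=$(mktemp -d)
BICLEANER_MODEL="bitextor/bicleaner-ai-full-en-xx"

# models--ORG--NAME, as in 07.fix: slashes become double dashes.
GIT_DIR=$HUGGINGFACE_HUB_CACHE/models--${BICLEANER_MODEL//\//--}
mkdir -p $GIT_DIR/refs $GIT_DIR/snapshots/abc123
echo abc123 > $GIT_DIR/refs/main

# refs/main holds the commit hash that names the snapshot directory.
RESOLVED=$GIT_DIR/snapshots/$(cat $GIT_DIR/refs/main)
test -d "$RESOLVED" || exit 1
echo "$RESOLVED"
```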
6 changes: 5 additions & 1 deletion 08.score
@@ -2,6 +2,11 @@
set -euo pipefail
shopt -s extglob

if [ "$IS_LUMI" = true ]; then
module load CrayEnv
module load rocm/5.2.3
fi

collection=$1
lang=$2
target_lang_data=$3
@@ -22,7 +27,6 @@ test -r $HARDRULED
paste <(zcat $FIXED) <(zcat $HARDRULED) \
| cache -k 3,4 ./score-wrap.py $BICLEANER $BICLEANER_PARAMS \
--score_only \
--processes $THREADS \
--tmp_dir $TMPDIR \
--disable_hardrules \
--disable_porn_removal \
15 changes: 11 additions & 4 deletions 08.score.sh
@@ -10,10 +10,17 @@ set -euo pipefail
collection=$1
shift

export SBATCH_ACCOUNT=t2-cs119-gpu
export SBATCH_PARTITION=pascal
export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise.
export SBATCH_GRES=gpu:1
if [ "$IS_LUMI" = true ]; then
export SBATCH_PARTITION="small-g"
export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise.
export SBATCH_GPUS_PER_TASK=1
unset SBATCH_MEM_PER_CPU # If we are setting this for small partition, we don't need it for gpu jobs
else
export SBATCH_ACCOUNT=t2-cs119-gpu
export SBATCH_PARTITION=pascal
export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise.
export SBATCH_GRES=gpu:1
fi

for lang in $*; do
bicleaner_ai_model $lang
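This export-based switch works because sbatch reads `SBATCH_*` input environment variables as defaults for the matching command-line options (`SBATCH_PARTITION` for `--partition`, `SBATCH_GRES` for `--gres`, and so on). A reduced sketch of the branch, with partition names taken from the diff and `IS_LUMI` assumed to come from the site config:

```shell
# Reduced sketch of the submission-variable switch in 08.score.sh.
IS_LUMI=${IS_LUMI:-true}   # assumed exported by the site config
if [ "$IS_LUMI" = true ]; then
    export SBATCH_PARTITION="small-g"
    export SBATCH_GPUS_PER_TASK=1
    unset SBATCH_MEM_PER_CPU   # only needed for CPU jobs on the small partition
else
    export SBATCH_ACCOUNT=t2-cs119-gpu
    export SBATCH_PARTITION=pascal
    export SBATCH_GRES=gpu:1
fi
echo "partition: $SBATCH_PARTITION"
```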
2 changes: 1 addition & 1 deletion 09.clean
@@ -42,7 +42,7 @@ paste <(pigz -dc $FIXED) <(pigz -dc $SCORED) `# add bicleaner score as the 9th c
>(pigz -9c > $CLASSIFIED.$TMPSFX) \
>(wc -wl | sed 's/^ \+//' | tr -s ' ' '\t' > $STATS.$TMPSFX) \
| awk -F"\t" "\$9 >= ${BICLEANER_THRESHOLD}" \
| python3 $BITEXTOR/bitextor-elrc-filtering.py -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection" -s \
| python3 bitextor-elrc-filtering.py -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection" -s \
| LC_ALL=C sort -t$'\t' -k7,7 -k8,8nr \
| pigz -9c \
> $FILTERED.$TMPSFX \
1 change: 1 addition & 0 deletions 10.reduce-classified.sh
@@ -20,6 +20,7 @@ for collection in $collections; do
batch_lists+=( $batch_list )
done

mkdir -p $DATA_CLEANING
output_file="${DATA_CLEANING}/${TARGET_LANG}-${lang}/${TARGET_LANG%~*}-${lang%~*}.${collection_hash}.classified.gz"

if [ ! -f $output_file ] || ! $RETRY; then
4 changes: 2 additions & 2 deletions 11.reduce-filtered
@@ -7,8 +7,8 @@ shift

# Set up temp directory
TMPSFX=${JOB_ID:-$$}
#TMPDIR=$(mktemp -d --tmpdir=$(dirname $output_file) --suffix=_$TMPSFX)
TMPDIR=$(mktemp -d --tmpdir=$SCRATCH --suffix=_$TMPSFX)
TMPDIR=$(mktemp -d --tmpdir=$(dirname $output_file) --suffix=_$TMPSFX)
#TMPDIR=$(mktemp -d --tmpdir=$SCRATCH --suffix=_$TMPSFX)
test -d "$TMPDIR"
trap "rm -rf $TMPDIR" EXIT

2 changes: 1 addition & 1 deletion 12.reduce-tmx
@@ -9,7 +9,7 @@ shift 3
filtered_input=$@

pigz -cd $filtered_input \
| PYTHONPATH=$PREFIX/src/bitextor python3 ${SCRIPTS}/bitextor-buildTMX.py \
| python3 bitextor-buildTMX.py \
--lang1 ${TARGET_LANG%~*} --lang2 ${lang} \
-c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection,lengthratio,numTokensSL,numTokensTL" \
--no-delete-seg \
2 changes: 1 addition & 1 deletion 12.reduce-tmx-deferred
@@ -8,7 +8,7 @@ shift 2
filtered_input=$@

pigz -cd $filtered_input \
| PYTHONPATH=$PREFIX/src/bitextor python3 ${SCRIPTS}/bitextor-buildTMX.py \
| python3 bitextor-buildTMX.py \
--lang1 ${TARGET_LANG%~*} --lang2 ${lang} \
-c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection,lengthratio,numTokensSL,numTokensTL" \
--dedup "bifixerhash" \
42 changes: 42 additions & 0 deletions Dockerfile
@@ -0,0 +1,42 @@
FROM bitextor/bitextor:8.3

COPY cirrus-scripts /cirrus-scripts
WORKDIR /cirrus-scripts

RUN git submodule update --init env/src/preprocess/
RUN mkdir /cirrus-scripts/env/src/paracrawl/build && \
cd /cirrus-scripts/env/src/paracrawl/build && \
cmake .. && \
make -j8 merge_sort && \
cp bin/merge_sort /usr/local/bin/

COPY GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /mkl-key.pub
RUN mkdir /etc/apt/keyrings
RUN gpg --dearmor -o /etc/apt/keyrings/mkl.gpg /mkl-key.pub && rm /mkl-key.pub
RUN echo "deb [signed-by=/etc/apt/keyrings/mkl.gpg] https://apt.repos.intel.com/mkl all main" > /etc/apt/sources.list.d/intel-mkl.list
RUN apt-get update && apt-get install -yy intel-mkl-64bit-2020.0-088

# Compile Marian CPU from Bergamot
RUN git clone https://github.com/browsermt/marian-dev /opt/marian-bergamot
WORKDIR /opt/marian-bergamot
RUN git checkout 2be8344fcf2776fb43a7376284067164674cbfaf
WORKDIR /opt/marian-bergamot/build
RUN cmake .. -DUSE_SENTENCEPIECE=on -DCOMPILE_CUDA=off -DUSE_FBGEMM=on
RUN make -j24

RUN pip uninstall -y tensorflow keras
RUN pip install tensorflow-rocm==2.12.1.600

RUN apt-get remove -yy intel-mkl-64bit-2020.0-088 build-essential && apt-get -yy autoremove && \
rm -Rf /opt/marian-bergamot/build/src && \
rm -Rf /opt/marian-bergamot/src && \
rm -Rf /opt/marian-bergamot/build/local && \
rm -Rf /opt/marian-bergamot/build/libmarian.a && \
strip /opt/marian-bergamot/build/marian* && \
strip /opt/marian-bergamot/build/spm*

RUN apt-get install -y locales
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8

ENTRYPOINT ["/bin/bash"]
3 changes: 1 addition & 2 deletions bitextor-buildTMX.py
@@ -39,8 +39,7 @@
import unicodedata
from xml.sax.saxutils import escape

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/utils")
from utils.common import open_xz_or_gzip_or_plain, dummy_open
from bitextor.utils.common import open_xz_or_gzip_or_plain, dummy_open

def remove_control_characters(text):
return "".join(ch for ch in text if unicodedata.category(ch)[0]!="C")
78 changes: 78 additions & 0 deletions bitextor-elrc-filtering.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

# This file is part of Bitextor.
#
# Bitextor is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Bitextor is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bitextor. If not, see <https://www.gnu.org/licenses/>.

import sys
import argparse

oparser = argparse.ArgumentParser(
description="Script that takes a list of aligned segments, such as that produced by bitextor-alignsegments "
"script, and computes the basic ELRC quality metrics: number of tokens in lang1/lang2 and length "
"ratio.")
oparser.add_argument('aligned_seg', metavar='FILE', nargs='?',
help='File containing the set of aligned segments (if undefined, the script reads from the '
'standard input)',
default=None)
oparser.add_argument("-s", "--stats", help="Print stats or just output the input", action="store_true",
dest="isPrintingStats", default=False)
oparser.add_argument("-f", "--filtering", help="Filter lines according to ELRC rules (printing stats required)",
action="store_true", dest="isFiltering", default=False)
oparser.add_argument("-c", "--columns",
help="Name of columns of the input tab separated file split by comma. Default: url1,url2,seg1,"
"seg2,hunalign,bicleaner",
default="url1,url2,seg1,seg2,hunalign,bicleaner")

options = oparser.parse_args()

if options.aligned_seg is not None:
reader = open(options.aligned_seg, "r")
else:
reader = sys.stdin

columns = options.columns.split(',')

for i in reader:
fields = i.split("\t")
fields[-1] = fields[-1].strip()
fieldsdict = dict()
extracolumns = []

for field, column in zip(fields, columns):
fieldsdict[column] = field
if options.isPrintingStats:
extracolumns = ["lengthratio", "numTokensSL", "numTokensTL"]
if len(fieldsdict["seg2"]) == 0:
lengthRatio = 0
else:
lengthRatio = len(fieldsdict["seg1"]) * 1.0 / len(fieldsdict["seg2"])
numTokensSL = len(fieldsdict["seg1"].split(
' ')) # This is not the way this should be counted, we need to tokenize better first
numTokensTL = len(fieldsdict["seg2"].split(
' ')) # This is not the way this should be counted, we need to tokenize better first
fieldsdict["lengthratio"] = str(lengthRatio)
fieldsdict["numTokensSL"] = str(numTokensSL)
fieldsdict["numTokensTL"] = str(numTokensTL)
if options.isFiltering:
if "bicleaner" in fieldsdict and fieldsdict["bicleaner"].strip() != '':
fieldsdict["bicleaner"] = str(round(float(fieldsdict["bicleaner"]), 4))
if int(fieldsdict["numTokensSL"]) >= 200 or int(fieldsdict["numTokensTL"]) >= 200 or fieldsdict[
"seg1"].strip() == '' or fieldsdict["seg2"].strip() == '' or float(
fieldsdict["lengthratio"]) >= 6 or float(fieldsdict["lengthratio"]) <= 0.1666:
continue
fieldstoprint = []
for column in columns + extracolumns:
fieldstoprint.append(fieldsdict[column])
print("\t".join(fieldstoprint))
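The filtering branch above keeps a pair only when both segments are non-empty, each side is under 200 tokens, and the character-length ratio lies strictly between 0.1666 and 6. The length-ratio and non-empty checks can be sketched as an awk one-liner over hypothetical tab-separated rows (url1, url2, seg1, seg2):

```shell
# Two made-up rows: the second fails the length-ratio bound (ratio >= 6)
# and is dropped, mirroring the Python filter's behaviour on that rule.
printf 'u1\tu2\thello there\thola amigo\nu3\tu4\tan extremely long source segment here\tx\n' \
| awk -F'\t' '$3 != "" && $4 != "" {
    r = length($3) / length($4)          # character-length ratio, as in the script
    if (r < 6 && r > 0.1666) print
}'
```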
2 changes: 2 additions & 0 deletions cirrus-scripts.def
@@ -0,0 +1,2 @@
bootstrap: docker-daemon
from: cirrus-scripts:latest