diff --git a/load_corpus.py b/load_corpus.py index d9abb20d..a6c7c448 100755 --- a/load_corpus.py +++ b/load_corpus.py @@ -32,6 +32,9 @@ parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int) parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)", default=None, type=int) + parser.add_argument('--samples_random', action='store_true', + help="Should random sampling with replacement be performed instead of continuous sampling (default: false)", + default=False) parser.add_argument('--keep_punct', action='store_true', help="whether to keep punctuation and caps (default is False)", default=False) parser.add_argument('--keep_sym', action='store_true', @@ -59,6 +62,7 @@ relFreqs=not args.absolute_freqs, format=args.x, sampling=args.sampling, units=args.sample_units, size=args.sample_size, step=args.sample_step, max_samples=args.max_samples, + samples_random=args.samples_random, keep_punct=args.keep_punct, keep_sym=args.keep_sym, identify_lang=args.identify_lang, embedding=args.embedding, neighbouring_size=args.neighbouring_size ) diff --git a/superstyl/load.py b/superstyl/load.py index 83101ffe..d4bb0869 100644 --- a/superstyl/load.py +++ b/superstyl/load.py @@ -6,8 +6,8 @@ import pandas def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs=True, format="txt", sampling=False, - units="words", size=3000, step=None, max_samples=None, keep_punct=False, keep_sym=False, - identify_lang=False, embedding=False, neighbouring_size=10): + units="words", size=3000, step=None, max_samples=None, samples_random=False, + keep_punct=False, keep_sym=False, identify_lang=False, embedding=False, neighbouring_size=10): """ Main function to load a corpus from a collection of file, and an optional list of features to extract. 
:param data_paths: paths to the source files @@ -28,6 +28,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs :param step: step for sampling with overlap (default is step = size, which means no overlap). Reduce for overlapping slices :param max_samples: Maximum number of (randomly selected) samples per author/class (default is all) + :param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false) :param keep_punct: whether to keep punctuation and caps (default is False) :param keep_sym: same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False). /!\ does not actually keep symbols @@ -50,7 +51,8 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs if sampling: myTexts = pipe.docs_to_samples(data_paths, format=format, units=units, size=size, step=step, - max_samples=max_samples, keep_punct=keep_punct, keep_sym=keep_sym, + max_samples=max_samples, samples_random=samples_random, + keep_punct=keep_punct, keep_sym=keep_sym, identify_lang = identify_lang ) diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py index 21b234fd..41c61168 100755 --- a/superstyl/preproc/pipe.py +++ b/superstyl/preproc/pipe.py @@ -92,11 +92,12 @@ def normalise(text, keep_punct=False, keep_sym=False): else: if keep_punct: - out = re.sub(r"[^\p{L}\p{P}]+", " ", text) + # Keep punctuation (and diacritics for now) + out = re.sub(r"[^\p{L}\p{P}\p{M}]+", " ", text) else: #out = re.sub(r"[\W0-9]+", " ", text.lower()) - out = re.sub(r"[^\p{L}]+", " ", text.lower()) + out = re.sub(r"[^\p{L}\p{M}]+", " ", text.lower()) out = unidecode.unidecode(out) @@ -173,7 +174,8 @@ def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_ # Load and split in samples of length -n- a collection of files -def get_samples(path, size, step=None, units="words", format="txt", keep_punct=False, keep_sym=False): +def get_samples(path, size, 
step=None, samples_random=False, max_samples=10, + units="words", format="txt", keep_punct=False, keep_sym=False): """ Take samples of n words or verses from a document, and then parse it. ONLY IMPLEMENTED FOR NOW: XML/TEI, TXT and verses or words as units @@ -181,10 +183,18 @@ def get_samples(path, size, step=None, units="words", format="txt", keep_punct=F :param size: sample size :param size: size of the step when sampling successively (determines overlap) default is the same as sample size (i.e. no overlap) + :param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false) + :param max_samples: maximum number of samples per author/class :param units: the units to use, one of "words" or "verses" :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED) """ + if samples_random and step is not None: + raise ValueError("random sampling is not compatible with continuous sampling (remove either the step or the samples_random argument") + + if samples_random and not max_samples: + raise ValueError("random sampling needs a fixed number of samples (use the max_samples argument)") + if step is None: step = size @@ -226,15 +236,21 @@ def get_samples(path, size, step=None, units="words", format="txt", keep_punct=F # and now generating output samples = [] - current = 0 - while current + size <= len(units): - samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])}) - current = current + step + + if samples_random: + for k in range(max_samples): + samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units, k=size))}) + + else: + current = 0 + while current + size <= len(units): + samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])}) + current = current + step return samples -def docs_to_samples(paths, size, step=None, units="words", format="txt", 
keep_punct=False, +def docs_to_samples(paths, size, step=None, units="words", samples_random=False, format="txt", keep_punct=False, keep_sym=False, max_samples=None, identify_lang=False): """ Loads a collection of documents into a 'myTexts' object for further processing BUT with samples ! @@ -243,6 +259,7 @@ def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_pu :param size: size of the step when sampling successively (determines overlap) default is the same as sample size (i.e. no overlap) :param units: the units to use, one of "words" or "verses" + :param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false) :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED) :param keep_punct: whether to keep punctuation and caps. :param max_samples: maximum number of samples per author/class. @@ -264,7 +281,8 @@ def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_pu else: lang = 'NA' - samples = get_samples(path, size=size, step=step, units=units, format=format, + samples = get_samples(path, size=size, step=step, samples_random=samples_random, max_samples=max_samples, + units=units, format=format, keep_punct=keep_punct, keep_sym=keep_sym) for sample in samples: diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py index e79491ed..9b181b0b 100644 --- a/tests/test_load_corpus.py +++ b/tests/test_load_corpus.py @@ -356,6 +356,23 @@ def test_docs_to_samples(self): # THEN self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 1) + # TODO: this is just minimal testing for random sampling + # WHEN + results = superstyl.preproc.pipe.docs_to_samples(self.paths, identify_lang=False, size=2, step=None, + units="words", + format="txt", keep_punct=False, keep_sym=False, + max_samples=5, samples_random=True) + # THEN + self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 5) + 
+ # and now test that errors are raised when parameter combinations are not consistent + # WHEN/THEN + self.assertRaises(ValueError, superstyl.preproc.pipe.docs_to_samples, self.paths, size=2, step=1, units="words", + format="txt", max_samples=5, samples_random=True) + self.assertRaises(ValueError, superstyl.preproc.pipe.docs_to_samples, self.paths, size=2, units="words", + format="txt", max_samples=None, + samples_random=True) + # TODO: test other loading formats with sampling, that are not txt (and decide on their implementation) # Testing the processing of "myTexts" objects