diff --git a/load_corpus.py b/load_corpus.py
index d9abb20d..a6c7c448 100755
--- a/load_corpus.py
+++ b/load_corpus.py
@@ -32,6 +32,9 @@
     parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
     parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
                         default=None, type=int)
+    parser.add_argument('--samples_random', action='store_true',
+                        help="Should random sampling with replacement be performed instead of continuous sampling (default: false)",
+                        default=False)
     parser.add_argument('--keep_punct', action='store_true', help="whether to keep punctuation and caps (default is False)",
                         default=False)
     parser.add_argument('--keep_sym', action='store_true',
@@ -59,6 +62,7 @@
                                    relFreqs=not args.absolute_freqs, format=args.x,
                                    sampling=args.sampling, units=args.sample_units,
                                    size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
+                                   samples_random=args.samples_random,
                                    keep_punct=args.keep_punct, keep_sym=args.keep_sym, identify_lang=args.identify_lang,
                                    embedding=args.embedding, neighbouring_size=args.neighbouring_size
                                    )
diff --git a/superstyl/load.py b/superstyl/load.py
index 83101ffe..d4bb0869 100644
--- a/superstyl/load.py
+++ b/superstyl/load.py
@@ -6,8 +6,8 @@
 import pandas
 
 def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs=True, format="txt", sampling=False,
-                units="words", size=3000, step=None, max_samples=None, keep_punct=False, keep_sym=False,
-                identify_lang=False, embedding=False, neighbouring_size=10):
+                units="words", size=3000, step=None, max_samples=None, samples_random=False,
+                keep_punct=False, keep_sym=False, identify_lang=False, embedding=False, neighbouring_size=10):
     """
     Main function to load a corpus from a collection of file, and an optional list of features to extract.
     :param data_paths: paths to the source files
@@ -28,6 +28,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
     :param step: step for sampling with overlap (default is step = size, which means no overlap).
     Reduce for overlapping slices
     :param max_samples: Maximum number of (randomly selected) samples per author/class (default is all)
+    :param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false)
     :param keep_punct: whether to keep punctuation and caps (default is False)
     :param keep_sym: same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False). /!\ does not
     actually keep symbols
@@ -50,7 +51,8 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
 
     if sampling:
         myTexts = pipe.docs_to_samples(data_paths, format=format, units=units, size=size, step=step,
-                                       max_samples=max_samples, keep_punct=keep_punct, keep_sym=keep_sym,
+                                       max_samples=max_samples, samples_random=samples_random,
+                                       keep_punct=keep_punct, keep_sym=keep_sym,
                                        identify_lang = identify_lang
                                        )
 
diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py
index 21b234fd..41c61168 100755
--- a/superstyl/preproc/pipe.py
+++ b/superstyl/preproc/pipe.py
@@ -92,11 +92,12 @@ def normalise(text, keep_punct=False, keep_sym=False):
 
     else:
         if keep_punct:
-            out = re.sub(r"[^\p{L}\p{P}]+", " ", text)
+            # Keep punctuation (and diacritics for now)
+            out = re.sub(r"[^\p{L}\p{P}\p{M}]+", " ", text)
 
         else:
             #out = re.sub(r"[\W0-9]+", " ", text.lower())
-            out = re.sub(r"[^\p{L}]+", " ", text.lower())
+            out = re.sub(r"[^\p{L}\p{M}]+", " ", text.lower())
 
         out = unidecode.unidecode(out)
 
@@ -173,7 +174,8 @@ def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_
 
 
 # Load and split in samples of length -n- a collection of files
-def get_samples(path, size, step=None, units="words", format="txt", keep_punct=False, keep_sym=False):
+def get_samples(path, size, step=None, samples_random=False, max_samples=10,
+                units="words", format="txt", keep_punct=False, keep_sym=False):
     """
     Take samples of n words or verses from a document, and then parse it.
     ONLY IMPLEMENTED FOR NOW: XML/TEI, TXT and verses or words as units
@@ -181,10 +183,18 @@ def get_samples(path, size, step=None, units="words", format="txt", keep_punct=F
     :param size: sample size
     :param size: size of the step when sampling successively (determines overlap) default is the same
     as sample size (i.e. no overlap)
+    :param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false)
+    :param max_samples: number of random samples to draw from the document (only used when samples_random is True)
     :param units: the units to use, one of "words" or "verses"
     :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
     """
 
+    if samples_random and step is not None:
+        raise ValueError("random sampling is not compatible with continuous sampling (remove either the step or the samples_random argument")
+
+    if samples_random and not max_samples:
+        raise ValueError("random sampling needs a fixed number of samples (use the max_samples argument)")
+
     if step is None:
         step = size
 
@@ -226,15 +236,21 @@ def get_samples(path, size, step=None, units="words", format="txt", keep_punct=F
 
     # and now generating output
     samples = []
-    current = 0
-    while current + size <= len(units):
-        samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])})
-        current = current + step
+
+    if samples_random:
+        for k in range(max_samples):
+            samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units, k=size))})
+
+    else:
+        current = 0
+        while current + size <= len(units):
+            samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])})
+            current = current + step
 
     return samples
 
 
-def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_punct=False,
+def docs_to_samples(paths, size, step=None, units="words", samples_random=False, format="txt", keep_punct=False,
                     keep_sym=False, max_samples=None, identify_lang=False):
     """
     Loads a collection of documents into a 'myTexts' object for further processing BUT with samples !
@@ -243,6 +259,7 @@ def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_pu
     :param size: size of the step when sampling successively (determines overlap) default is the same
     as sample size (i.e. no overlap)
     :param units: the units to use, one of "words" or "verses"
+    :param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false)
     :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
     :param keep_punct: whether to keep punctuation and caps.
     :param max_samples: maximum number of samples per author/class.
@@ -264,7 +281,8 @@ def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_pu
         else:
             lang = 'NA'
 
-        samples = get_samples(path, size=size, step=step, units=units, format=format,
+        samples = get_samples(path, size=size, step=step, samples_random=samples_random, max_samples=max_samples,
+                              units=units, format=format,
                               keep_punct=keep_punct, keep_sym=keep_sym)
 
         for sample in samples:
diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py
index e79491ed..9b181b0b 100644
--- a/tests/test_load_corpus.py
+++ b/tests/test_load_corpus.py
@@ -356,6 +356,23 @@ def test_docs_to_samples(self):
         # THEN
         self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 1)
 
+        # TODO: this is just minimal testing for random sampling
+        # WHEN
+        results = superstyl.preproc.pipe.docs_to_samples(self.paths, identify_lang=False, size=2, step=None,
+                                                         units="words",
+                                                         format="txt", keep_punct=False, keep_sym=False,
+                                                         max_samples=5, samples_random=True)
+        # THEN
+        self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 5)
+
+        # and now test that errors are raised when parameter combinations are inconsistent
+        # WHEN/THEN
+        self.assertRaises(ValueError, superstyl.preproc.pipe.docs_to_samples, self.paths, size=2, step=1, units="words",
+                                                         format="txt", max_samples=5, samples_random=True)
+        self.assertRaises(ValueError, superstyl.preproc.pipe.docs_to_samples, self.paths, size=2, units="words",
+                                                                             format="txt", max_samples=None,
+                                                                             samples_random=True)
+
     # TODO: test other loading formats with sampling, that are not txt (and decide on their implementation)
 
     # Testing the processing of "myTexts" objects