Skip to content

Commit

Permalink
Merge pull request #71 from SupervisedStylometry/binarize
Browse files Browse the repository at this point in the history
beginning of something to have relative, absolute or binary freqs
  • Loading branch information
Jean-Baptiste-Camps authored Oct 21, 2024
2 parents 00f1458 + ab7b34d commit 5ec1641
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 31 deletions.
7 changes: 5 additions & 2 deletions load_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
default="words", choices=["words", "chars", "affixes", "pos"])
parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs",
default="relative",
choices=["relative", "absolute", "binary"]
)
parser.add_argument('-x', action='store', help="format (txt, xml or tei) /!\ only txt is fully implemented",
default="txt",
choices=["txt", "xml", "tei"]
Expand Down Expand Up @@ -59,7 +62,7 @@
my_feats = None

corpus, my_feats = load_corpus(args.s, feat_list=my_feats, feats=args.t, n=args.n, k=args.k,
relFreqs=not args.absolute_freqs, format=args.x,
freqsType=args.freqs, format=args.x,
sampling=args.sampling, units=args.sample_units,
size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
samples_random=args.samples_random,
Expand Down
17 changes: 9 additions & 8 deletions superstyl/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import tqdm
import pandas

def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs=True, format="txt", sampling=False,
units="words", size=3000, step=None, max_samples=None, samples_random=False,
keep_punct=False, keep_sym=False, identify_lang=False, embedding=False, neighbouring_size=10):

def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsType="relative", format="txt", sampling=False,
units="words", size=3000, step=None, max_samples=None, samples_random=False, keep_punct=False, keep_sym=False,
identify_lang=False, embedding=False, neighbouring_size=10):
"""
Main function to load a corpus from a collection of file, and an optional list of features to extract.
:param data_paths: paths to the source files
Expand All @@ -18,7 +19,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
:param n: n grams lengths (default 1)
:param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features),
gets its frequencies, and only include features of superior or equal total frequencies.
:param relFreqs: return relative frequencies (default: True)
:param freqsType: return relative, absolute or binarised frequencies (default: relative)
:param format: one of txt, xml or tei. /!\ only txt is fully implemented.
:param sampling: whether to sample the texts, by cutting it into slices of a given length, until the last possible
slice of this length, which means that often the end of the text will be eliminated (default False)
Expand Down Expand Up @@ -46,15 +47,15 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
relFreqs = False # we need absolute freqs as a basis for embedded frequencies
model = embed.load_embeddings(embedding)
embeddedFreqs = True
freqsType = "absolute" #absolute freqs are required for embedding

print(".......loading texts.......")

if sampling:
myTexts = pipe.docs_to_samples(data_paths, format=format, units=units, size=size, step=step,
max_samples=max_samples, samples_random=samples_random,
keep_punct=keep_punct, keep_sym=keep_sym,
identify_lang = identify_lang
)
identify_lang = identify_lang)

else:
myTexts = pipe.load_texts(data_paths, format=format, max_samples=max_samples, keep_punct=keep_punct,
Expand All @@ -63,7 +64,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
print(".......getting features.......")

if feat_list is None:
feat_list = fex.get_feature_list(myTexts, feats=feats, n=n, relFreqs=relFreqs)
feat_list = fex.get_feature_list(myTexts, feats=feats, n=n, freqsType=freqsType)
if k > len(feat_list):
print("K Limit ignored because the size of the list is lower ({} < {})".format(len(feat_list), k))
else:
Expand All @@ -75,7 +76,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
print(".......getting counts.......")

my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies
myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, relFreqs=relFreqs)
myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType)

if embedding:
print(".......embedding counts.......")
Expand Down
35 changes: 28 additions & 7 deletions superstyl/preproc/features_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ def count_features(text, feats ="words", n = 1):
#POS in english with NLTK - need to propose spacy later on
elif feats == "pos":
try:
nltk.data.find('taggers/averaged_perceptron_tagger')
nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except:
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
words = nltk.tokenize.wordpunct_tokenize(text)
pos_tags = [pos for word, pos in nltk.pos_tag(words)]
if n > 1:
Expand Down Expand Up @@ -85,12 +85,25 @@ def relative_frequencies(wordCounts, total):
wordCounts[t] = wordCounts[t] / total
return wordCounts

def bin_frequencies(wordCounts):
    """
    Binarise a counter of word counts, in place.

    Every strictly positive count is replaced by 1; zero counts are left
    unchanged, so the result only contains 0/1 presence indicators.
    :param wordCounts: a dictionary of word counts
    :return: the same dictionary, with binarised (0/1) frequencies
    """
    # NOTE: .keys() iteration is safe here because only values are mutated,
    # never the key set itself.
    for t in wordCounts:
        if wordCounts[t] > 0:
            wordCounts[t] = 1

    return wordCounts


def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
:param feats: type of feats (words, chars, affixes or POS)
:param freqsType: "relative", "absolute" or "binary" frequencies
:param n: n-grams length
:return: list of features, with total frequency
"""
Expand All @@ -103,33 +116,41 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
my_feats.update(counts)
total = total + text_total

if relFreqs:
if freqsType == "relative":
my_feats = relative_frequencies(my_feats, total)
elif freqsType == "binary":
my_feats = bin_frequencies(my_feats)

# sort them
my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]

return my_feats


def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False):
def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"):
"""
Get counts for a collection of texts
:param myTexts: the document collection
:param feat_list: a list of features to be selected (None for all)
:param feats: the type of feats (words, chars, affixes, POS)
:param n: the length of n-grams
:param relFreqs: whether to compute relative freqs
:param freqsType: relative, absolute or binarised freqs
:return: the collection with, for each text, a 'wordCounts' dictionary
"""

if freqsType not in ["relative", "absolute", "binary"]:
raise ValueError("Unsupported frequency type. Choose from 'relative', 'absolute', or 'binary'.")

for i in enumerate(myTexts):

counts, total = count_features(myTexts[i[0]]["text"], feats=feats, n=n)

if relFreqs:
if freqsType == "relative":
counts = relative_frequencies(counts, total)

elif freqsType == "binary":
counts = bin_frequencies(counts)

if feat_list:
# and keep only the ones in the feature list
counts = {f: counts[f] for f in feat_list if f in counts.keys()}
Expand Down
Loading

0 comments on commit 5ec1641

Please sign in to comment.