Skip to content

Commit

Permalink
Merge pull request #71 from SupervisedStylometry/binarize
Browse files Browse the repository at this point in the history
beginning of something to have relative, absolute or binary freqs
  • Loading branch information
Jean-Baptiste-Camps authored Oct 21, 2024
2 parents 00f1458 + ab7b34d commit 5ec1641
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 31 deletions.
7 changes: 5 additions & 2 deletions load_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
default="words", choices=["words", "chars", "affixes", "pos"])
parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs",
default="relative",
choices=["relative", "absolute", "binary"]
)
parser.add_argument('-x', action='store', help="format (txt, xml or tei) /!\ only txt is fully implemented",
default="txt",
choices=["txt", "xml", "tei"]
Expand Down Expand Up @@ -59,7 +62,7 @@
my_feats = None

corpus, my_feats = load_corpus(args.s, feat_list=my_feats, feats=args.t, n=args.n, k=args.k,
relFreqs=not args.absolute_freqs, format=args.x,
freqsType=args.freqs, format=args.x,
sampling=args.sampling, units=args.sample_units,
size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
samples_random=args.samples_random,
Expand Down
17 changes: 9 additions & 8 deletions superstyl/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import tqdm
import pandas

def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs=True, format="txt", sampling=False,
units="words", size=3000, step=None, max_samples=None, samples_random=False,
keep_punct=False, keep_sym=False, identify_lang=False, embedding=False, neighbouring_size=10):

def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsType="relative", format="txt", sampling=False,
units="words", size=3000, step=None, max_samples=None, samples_random=False, keep_punct=False, keep_sym=False,
identify_lang=False, embedding=False, neighbouring_size=10):
"""
Main function to load a corpus from a collection of file, and an optional list of features to extract.
:param data_paths: paths to the source files
Expand All @@ -18,7 +19,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
:param n: n grams lengths (default 1)
:param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features),
gets its frequencies, and only include features of superior or equal total frequencies.
:param relFreqs: return relative frequencies (default: True)
:param freqsType: return relative, absolute or binarised frequencies (default: relative)
:param format: one of txt, xml or tei. /!\ only txt is fully implemented.
:param sampling: whether to sample the texts, by cutting it into slices of a given length, until the last possible
slice of this length, which means that often the end of the text will be eliminated (default False)
Expand Down Expand Up @@ -46,15 +47,15 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
relFreqs = False # we need absolute freqs as a basis for embedded frequencies
model = embed.load_embeddings(embedding)
embeddedFreqs = True
freqsType = "absolute" #absolute freqs are required for embedding

print(".......loading texts.......")

if sampling:
myTexts = pipe.docs_to_samples(data_paths, format=format, units=units, size=size, step=step,
max_samples=max_samples, samples_random=samples_random,
keep_punct=keep_punct, keep_sym=keep_sym,
identify_lang = identify_lang
)
identify_lang = identify_lang)

else:
myTexts = pipe.load_texts(data_paths, format=format, max_samples=max_samples, keep_punct=keep_punct,
Expand All @@ -63,7 +64,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
print(".......getting features.......")

if feat_list is None:
feat_list = fex.get_feature_list(myTexts, feats=feats, n=n, relFreqs=relFreqs)
feat_list = fex.get_feature_list(myTexts, feats=feats, n=n, freqsType=freqsType)
if k > len(feat_list):
print("K Limit ignored because the size of the list is lower ({} < {})".format(len(feat_list), k))
else:
Expand All @@ -75,7 +76,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
print(".......getting counts.......")

my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies
myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, relFreqs=relFreqs)
myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType)

if embedding:
print(".......embedding counts.......")
Expand Down
35 changes: 28 additions & 7 deletions superstyl/preproc/features_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ def count_features(text, feats ="words", n = 1):
#POS in english with NLTK - need to propose spacy later on
elif feats == "pos":
try:
nltk.data.find('taggers/averaged_perceptron_tagger')
nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except:
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
words = nltk.tokenize.wordpunct_tokenize(text)
pos_tags = [pos for word, pos in nltk.pos_tag(words)]
if n > 1:
Expand Down Expand Up @@ -85,12 +85,25 @@ def relative_frequencies(wordCounts, total):
wordCounts[t] = wordCounts[t] / total
return wordCounts

def bin_frequencies(wordCounts):
    """
    Binarise a counter of word counts, in place.

    Every strictly positive count is replaced by 1; zero counts are left
    unchanged, so the result only contains 0/1 presence indicators.
    :param wordCounts: a dictionary of word counts
    :return: the same dictionary, with binarised (0/1) frequencies
    """
    # NOTE: .keys() iteration is safe here because only values are mutated,
    # never the key set itself.
    for t in wordCounts:
        if wordCounts[t] > 0:
            wordCounts[t] = 1

    return wordCounts


def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
:param feats: type of feats (words, chars, affixes or POS)
:param freqsType: "relative", "absolute" or "binary" frequencies
:param n: n-grams length
:return: list of features, with total frequency
"""
Expand All @@ -103,33 +116,41 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
my_feats.update(counts)
total = total + text_total

if relFreqs:
if freqsType == "relative":
my_feats = relative_frequencies(my_feats, total)
elif freqsType == "binary":
my_feats = bin_frequencies(my_feats)

# sort them
my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]

return my_feats


def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False):
def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"):
"""
Get counts for a collection of texts
:param myTexts: the document collection
:param feat_list: a list of features to be selected (None for all)
:param feats: the type of feats (words, chars, affixes, POS)
:param n: the length of n-grams
:param relFreqs: whether to compute relative freqs
:param freqsType: relative, absolute or binarised freqs
:return: the collection with, for each text, a 'wordCounts' dictionary
"""

if freqsType not in ["relative", "absolute", "binary"]:
raise ValueError("Unsupported frequency type. Choose from 'relative', 'absolute', or 'binary'.")

for i in enumerate(myTexts):

counts, total = count_features(myTexts[i[0]]["text"], feats=feats, n=n)

if relFreqs:
if freqsType == "relative":
counts = relative_frequencies(counts, total)

elif freqsType == "binary":
counts = bin_frequencies(counts)

if feat_list:
# and keep only the ones in the feature list
counts = {f: counts[f] for f in feat_list if f in counts.keys()}
Expand Down
Loading

0 comments on commit 5ec1641

Please sign in to comment.