From fc3e85a7a8b06626d0aa39c31d46ee9bbc7fefbd Mon Sep 17 00:00:00 2001 From: Hady Osman Date: Thu, 24 Feb 2022 22:14:33 +1300 Subject: [PATCH 1/3] Initial work in progress --- package.json | 1 + src/components/suggestionsPopup.ts | 22 ++-- src/indexing/indexer.ts | 15 ++- src/indexing/utils.spec.ts | 169 ++++++++++++++++++++++++----- src/indexing/utils.ts | 86 ++++++++++++++- src/search/index.ts | 26 ++++- webpack.config.ts | 1 + 7 files changed, 276 insertions(+), 44 deletions(-) diff --git a/package.json b/package.json index dcab5f8..b6781c9 100644 --- a/package.json +++ b/package.json @@ -69,6 +69,7 @@ "*.{js,css,md}": "prettier --write" }, "dependencies": { + "@liquicode/lib-tokenize": "^0.1.4", "@tanishiking/aho-corasick": "^0.0.1", "@types/natural": "^5.1.0", "lodash": "^4.17.21", diff --git a/src/components/suggestionsPopup.ts b/src/components/suggestionsPopup.ts index 7eee461..9c81b32 100644 --- a/src/components/suggestionsPopup.ts +++ b/src/components/suggestionsPopup.ts @@ -14,16 +14,18 @@ const item = (icon, title, click) => { export const showSuggestionsModal = (props: SuggestionsModalProps): void => { const { app, mouseEvent, suggestions, onClick } = props; - const menu = new Menu(app); + setTimeout(() => { + const menu = new Menu(app); - suggestions.forEach((replaceText) => { - menu.addItem( - item('pencil', `Replace with ${replaceText}`, () => { - onClick(replaceText); - }) - ); - }); + suggestions.forEach((replaceText) => { + menu.addItem( + item('pencil', `Replace with ${replaceText}`, () => { + onClick(replaceText); + }) + ); + }); - menu.addSeparator(); - menu.showAtMouseEvent(mouseEvent); + menu.addSeparator(); + menu.showAtMouseEvent(mouseEvent); + }, 100); }; diff --git a/src/indexing/indexer.ts b/src/indexing/indexer.ts index 787ae00..8244793 100644 --- a/src/indexing/indexer.ts +++ b/src/indexing/indexer.ts @@ -1,14 +1,16 @@ +import _ from 'lodash'; import lokijs from 'lokijs'; import { TypedEmitter } from 'tiny-typed-emitter'; import type { TFile } from 'obsidian'; -import { tokenize } from './utils'; +import { tokenizeWithStem } from './utils'; import type { PluginHelper } from '../plugin-helper'; type Document = { fileCreationTime: number; type: 'tag' | 'alias' | 'page' | 'page-token'; keyword: string; + originalText: string; replaceText: string; }; @@ -34,9 +36,11 @@ export class Indexer extends TypedEmitter { // Exclude any keywords associated with active file as we don't want recursive highlighting const exclusionFile = this.pluginHelper.activeFile; - return this.documents + const keywords = this.documents .where((doc) => doc.fileCreationTime !== exclusionFile.stat.ctime) .map((doc) => doc.keyword); + + return _.uniq(keywords); } public getDocumentsByKeyword(keyword: string): Document[] { @@ -45,6 +49,7 @@ export class Indexer extends TypedEmitter { public buildIndex(): void { this.pluginHelper.getAllFiles().forEach((file) => this.indexFile(file)); + console.log('index has been built', this.documents); this.emit('indexRebuilt'); } @@ -63,14 +68,16 @@ export class Indexer extends TypedEmitter { fileCreationTime: file.stat.ctime, type: 'page', keyword: file.basename.toLowerCase(), + originalText: file.basename, replaceText: `[[${file.basename}]]`, }); - tokenize(file.basename).forEach((token) => { + tokenizeWithStem(file.basename).forEach((token) => { this.documents.insert({ fileCreationTime: file.stat.ctime, type: 'page-token', keyword: token, + originalText: file.basename, replaceText: `[[${file.basename}]]`, }); }); @@ -80,6 +87,7 @@ export class Indexer 
extends TypedEmitter { fileCreationTime: file.stat.ctime, type: 'alias', keyword: alias.toLowerCase(), + originalText: file.basename, replaceText: `[[${file.basename}|${alias}]]`, }); }); @@ -89,6 +97,7 @@ export class Indexer extends TypedEmitter { fileCreationTime: file.stat.ctime, type: 'tag', keyword: tag.replace(/#/, '').toLowerCase(), + originalText: tag, replaceText: tag, }); }); diff --git a/src/indexing/utils.spec.ts b/src/indexing/utils.spec.ts index ab71c50..45c7dc4 100644 --- a/src/indexing/utils.spec.ts +++ b/src/indexing/utils.spec.ts @@ -1,29 +1,146 @@ -import { tokenize } from './utils'; - -describe('tokenize', () => { - const dataSet = [ - { - sentence: 'The quick brown fox jumps over the lazy dog.', - expected: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'], - }, - { - sentence: 'GitHub Forks', - expected: ['github', 'fork'], - }, - { - sentence: 'John Doe', - expected: ['john', 'doe'], - }, - { - sentence: 'Approximate Inference', - expected: ['approxim', 'infer'], - }, - ]; - - dataSet.forEach(({ sentence, expected }) => { - it(`Tokenizes and removes stop words ("${sentence}", [${expected}]`, () => { - const tokens = tokenize(sentence); - expect(tokens).toEqual(expected); +import { Emit } from '@tanishiking/aho-corasick'; + +import { + bigramStemmedTokens, + tokenizeWithStem, + tokenizeText, + stemTokens, + mapStemmedEmitsToOriginal, +} from './utils'; + +describe.only('utils', () => { + describe('tokenizeWithStem', () => { + const dataSet = [ + { + sentence: 'The quick brown fox jumps over the lazy dog.', + expected: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'], + }, + { + sentence: 'GitHub Forks', + expected: ['github', 'fork'], + }, + { + sentence: 'John Doe', + expected: ['john', 'doe'], + }, + { + sentence: 'Approximate Inference', + expected: ['approxim', 'infer'], + }, + ]; + + dataSet.forEach(({ sentence, expected }) => { + it(`Tokenizes and removes stop words ("${sentence}", [${expected}]`, () => { + const tokens = tokenizeWithStem(sentence); + expect(tokens).toEqual(expected); + }); + }); + }); + + describe('bigramStemmedTokens', () => { + const dataSet = [ + { + sentence: 'John', + expected: ['john'], + }, + { + sentence: 'John Doe', + expected: ['john', 'doe'], + }, + { + sentence: 'GitHub Forking tutorial', + expected: ['github fork', 'fork tutori'], + }, + { + sentence: 'The Five Dysfunctions of a Team', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'The Girl with the Dragon Tattoo', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'The 7 Habits of Highly Effective People', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'Code that changes together stays together', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: "You rise to your level of your leadership's incompetence", + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'Shortening the feedback cycle', + expected: ['five dysfunct', 'dysfunct team'], + }, + ]; + + dataSet.forEach(({ sentence, expected }) => { + it(`Generates stemmed bigram tokens ("${sentence}", [${expected}]`, () => { + const bigramTokens = bigramStemmedTokens(sentence); + expect(bigramTokens).toEqual(expected); + }); + }); + }); + + describe('tokenizeText', () => { + it('Tokenize a sentence into an array of tokens', () => { + const paragraph = 'The "quick fox" jumps; ~over “the” _lazy dog.'; + + const tokens = tokenizeText(paragraph); + + expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); + 
expect(tokens.length).toEqual(16); + }); + + it('Tokenize a sentence with a line break', () => { + const paragraph = `This is a test note. + +spanning + +multiple lines`; + + const tokens = tokenizeText(paragraph); + + expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); + expect(tokens.length).toEqual(15); + }); + + it('Tokenize a sentence with an apostrophe', () => { + const paragraph = `1. “Shared client record”`; + + const tokens = tokenizeText(paragraph); + + expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); + expect(tokens.length).toEqual(8); + }); + + it('Stems a sentence', () => { + const paragraph = + 'The quick brown fox jumps over the changing, patiently; waiting doggy.'; + const expected = 'the quick brown fox jump over the chang, patient; wait doggi.'; + + const tokens = tokenizeText(paragraph); + const stems = stemTokens(tokens); + + expect(stems.map((t) => t.stem).join('')).toEqual(expected); + }); + + it('Map emitted stems to root tokens', () => { + const paragraph = 'The connecting and the consulting spirit'; // Maps to 'the connect and the consult spirit' + const searchEmits: Emit[] = [new Emit(4, 11, 'connect'), new Emit(20, 27, 'consult')]; + + const expectedMappedEmits: Emit[] = [ + new Emit(4, 14, 'connecting'), + new Emit(23, 33, 'consulting'), + ]; + + const stems = stemTokens(tokenizeText(paragraph)); + const mappedEmits = mapStemmedEmitsToOriginal(stems, searchEmits); + + expect(mappedEmits).toEqual(expectedMappedEmits); }); }); }); diff --git a/src/indexing/utils.ts b/src/indexing/utils.ts index 3318cd4..80e7be3 100644 --- a/src/indexing/utils.ts +++ b/src/indexing/utils.ts @@ -1,5 +1,85 @@ -import natural from 'natural'; +import LIB_TOKENIZE from '@liquicode/lib-tokenize'; +import { Emit } from '@tanishiking/aho-corasick'; +import { PorterStemmer, NGrams, WordPunctTokenizer } from 'natural'; -export const tokenize = (text: string): string[] => { - return natural.PorterStemmer.tokenizeAndStem(text); +const tokenizer = LIB_TOKENIZE.NewTokenizer(); +tokenizer.whitespace = ` \t\r\n.“”`; +tokenizer.symbols = `,;=`; +tokenizer.literal_delimiters = `"`; +tokenizer.literal_escape_chars = `\\`; + +const tokenizer2 = new WordPunctTokenizer(); + +/** + * Tokenizes a string into words along with: + * (a) Removing stop words + * (b) Removing punctuation + * (c) Stemming words + */ +export const tokenizeWithStem = (text: string): string[] => { + return PorterStemmer.tokenizeAndStem(text); +}; + +export const bigramStemmedTokens = (text: string): string[] => { + const tokens = tokenizeWithStem(text); + + if (tokens.length > 2) { + const bigrams = NGrams.bigrams(tokens); + return bigrams.map((bigram) => bigram.join(' ')); + } + + return tokens; +}; + +type Token = { + at: number; + token: string; + type: 'wsp' | 'sym' | 'lit' | 'idf' | 'num' | 'kwd'; +}; + +type StemmedToken = { + stem: string; + stemStart: number; + stemEnd: number; + original: string; + originalStart: number; + originalEnd: number; +}; + +export const tokenizeText = (text: string): Token[] => { + console.log(tokenizer2.tokenize(text)) + console.log(tokenizer.tokenize(text)) + + return tokenizer.tokenize(text); +}; + +export const stemTokens = (tokens: Token[]): StemmedToken[] => { + let index = 0; + + return tokens.reduce((acc, t) => { + const stem = PorterStemmer.stem(t.token); + + acc.push({ + stem, + stemStart: index, + stemEnd: index + stem.length, + original: t.token, + originalStart: t.at, + originalEnd: t.at + t.token.length, + }); + + index += stem.length; + + return acc; + }, 
[]); +}; + +export const mapStemmedEmitsToOriginal = (stems: StemmedToken[], emits: Emit[]): Emit[] => { + console.log('stemmed tokens', stems); + + return emits.map((e) => { + console.log('looking for in tokens', e); + const matchingStem = stems.find((s) => s.stemStart === e.start); + return new Emit(matchingStem.originalStart, matchingStem.originalEnd, e.keyword); + }); }; diff --git a/src/search/index.ts b/src/search/index.ts index 79944c8..7ab9e00 100644 --- a/src/search/index.ts +++ b/src/search/index.ts @@ -3,6 +3,8 @@ import { Trie, Emit } from '@tanishiking/aho-corasick'; import type { Indexer } from '../indexing/indexer'; +import { tokenizeText, stemTokens, mapStemmedEmitsToOriginal } from '../indexing/utils'; + type SearchResult = { start: number; end: number; @@ -35,9 +37,29 @@ export default class Search { public find(text: string): SearchResult[] { const redactedText = this.redactText(text); // Redact text that we don't want to be searched - const results = this.trie.parseText(redactedText); + console.log('redactedText'); + console.log(redactedText); + + // Stem the text + const stemmedTokens = stemTokens(tokenizeText(redactedText)); + const stemmedText = stemmedTokens.map((t) => t.stem).join(''); + + console.log('stemmedText'); + console.log(stemmedText); + + // Search stemmed text + const stemmedResults = this.trie.parseText(stemmedText); + + console.log('stemmedResults'); + console.log(stemmedResults); + + // Map stemmed results to original text + const originalResults = mapStemmedEmitsToOriginal(stemmedTokens, stemmedResults); + + console.log('originalResults'); + console.log(originalResults); - return this.mapToSearchResults(results); + return this.mapToSearchResults(originalResults); } private mapToSearchResults(results: Emit[]): SearchResult[] { diff --git a/webpack.config.ts b/webpack.config.ts index 1779028..09b828a 100644 --- a/webpack.config.ts +++ b/webpack.config.ts @@ -72,6 +72,7 @@ const config: Configuration = { '@codemirror/view': 'commonjs2 @codemirror/view', '@codemirror/state': 'commonjs2 @codemirror/state', '@codemirror/rangeset': 'commonjs2 @codemirror/rangeset', + 'webworker-threads': 'require(webworker-threads)', }, }; From 6517263a6bfdf56ac7e4430632bda5f049e106f7 Mon Sep 17 00:00:00 2001 From: Hady Osman Date: Sun, 27 Feb 2022 01:04:45 +1300 Subject: [PATCH 2/3] Re-engineer index to match both partial and multiple words --- manifest.json | 2 +- package.json | 2 +- src/cmExtension/suggestionsExtension.ts | 10 +- src/indexing/indexer.ts | 11 +- src/indexing/utils.spec.ts | 146 ------------------ src/indexing/utils.ts | 85 ---------- src/search/index.ts | 55 +++---- src/search/mapStemToOriginalText.ts | 28 ++++ ...earch.utils.spec.ts => redactText.spec.ts} | 2 +- src/search/{search.utils.ts => redactText.ts} | 0 src/search/search.spec.ts | 81 ++++++++++ src/stemmers/index.ts | 21 +++ src/tokenizers/index.ts | 86 +++++++++++ src/tokenizers/tokenizer.spec.ts | 93 +++++++++++ 14 files changed, 343 insertions(+), 279 deletions(-) delete mode 100644 src/indexing/utils.spec.ts delete mode 100644 src/indexing/utils.ts create mode 100644 src/search/mapStemToOriginalText.ts rename src/search/{search.utils.spec.ts => redactText.spec.ts} (97%) rename src/search/{search.utils.ts => redactText.ts} (100%) create mode 100644 src/search/search.spec.ts create mode 100644 src/stemmers/index.ts create mode 100644 src/tokenizers/index.ts create mode 100644 src/tokenizers/tokenizer.spec.ts diff --git a/manifest.json b/manifest.json index e30c5b6..e474c4e 100644 --- 
a/manifest.json +++ b/manifest.json @@ -2,7 +2,7 @@ "id": "obsidian-sidekick", "name": "Sidekick", "description": "A companion to identify hidden connections that match your tags and pages", - "version": "1.4.3", + "version": "1.5.0", "minAppVersion": "0.13.8", "author": "Hady Osman", "authorUrl": "https://hady.geek.nz", diff --git a/package.json b/package.json index d8ae268..6eaa209 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "obsidian-sidekick", - "version": "1.4.3", + "version": "1.5.0", "description": "A companion to identify hidden connections that match your tags and pages", "main": "src/index.ts", "repository": { diff --git a/src/cmExtension/suggestionsExtension.ts b/src/cmExtension/suggestionsExtension.ts index 718e8de..a43e0cb 100644 --- a/src/cmExtension/suggestionsExtension.ts +++ b/src/cmExtension/suggestionsExtension.ts @@ -16,11 +16,11 @@ import './suggestionsExtension.css'; const SuggestionCandidateClass = 'cm-suggestion-candidate'; -const underlineDecoration = (start: number, end: number, keyword: string) => +const underlineDecoration = (start: number, end: number, indexKeyword: string) => Decoration.mark({ class: SuggestionCandidateClass, attributes: { - 'data-keyword': keyword, + 'data-index-keyword': indexKeyword, 'data-position-start': `${start}`, 'data-position-end': `${end}`, }, @@ -68,7 +68,7 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin { view.dispatch({ changes: { diff --git a/src/indexing/indexer.ts b/src/indexing/indexer.ts index 8244793..3fbc4c5 100644 --- a/src/indexing/indexer.ts +++ b/src/indexing/indexer.ts @@ -3,7 +3,8 @@ import lokijs from 'lokijs'; import { TypedEmitter } from 'tiny-typed-emitter'; import type { TFile } from 'obsidian'; -import { tokenizeWithStem } from './utils'; +import { stemPhrase } from '../stemmers'; +import { WordPermutationsTokenizer } from '../tokenizers'; import type { PluginHelper } from '../plugin-helper'; type Document = { @@ -21,6 +22,7 @@ interface IndexerEvents { export class Indexer extends TypedEmitter { private documents: Collection; + private permutationTokenizer: WordPermutationsTokenizer; constructor(private pluginHelper: PluginHelper) { super(); @@ -30,6 +32,8 @@ export class Indexer extends TypedEmitter { this.documents = db.addCollection('documents', { indices: ['fileCreationTime', 'keyword'], }); + + this.permutationTokenizer = new WordPermutationsTokenizer(); } public getKeywords(): string[] { @@ -49,7 +53,6 @@ export class Indexer extends TypedEmitter { public buildIndex(): void { this.pluginHelper.getAllFiles().forEach((file) => this.indexFile(file)); - console.log('index has been built', this.documents); this.emit('indexRebuilt'); } @@ -67,12 +70,12 @@ export class Indexer extends TypedEmitter { this.documents.insert({ fileCreationTime: file.stat.ctime, type: 'page', - keyword: file.basename.toLowerCase(), + keyword: stemPhrase(file.basename), originalText: file.basename, replaceText: `[[${file.basename}]]`, }); - tokenizeWithStem(file.basename).forEach((token) => { + this.permutationTokenizer.tokenize(file.basename).forEach((token) => { this.documents.insert({ fileCreationTime: file.stat.ctime, type: 'page-token', diff --git a/src/indexing/utils.spec.ts b/src/indexing/utils.spec.ts deleted file mode 100644 index 45c7dc4..0000000 --- a/src/indexing/utils.spec.ts +++ /dev/null @@ -1,146 +0,0 @@ -import { Emit } from '@tanishiking/aho-corasick'; - -import { - bigramStemmedTokens, - tokenizeWithStem, - tokenizeText, - stemTokens, - 
mapStemmedEmitsToOriginal, -} from './utils'; - -describe.only('utils', () => { - describe('tokenizeWithStem', () => { - const dataSet = [ - { - sentence: 'The quick brown fox jumps over the lazy dog.', - expected: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'], - }, - { - sentence: 'GitHub Forks', - expected: ['github', 'fork'], - }, - { - sentence: 'John Doe', - expected: ['john', 'doe'], - }, - { - sentence: 'Approximate Inference', - expected: ['approxim', 'infer'], - }, - ]; - - dataSet.forEach(({ sentence, expected }) => { - it(`Tokenizes and removes stop words ("${sentence}", [${expected}]`, () => { - const tokens = tokenizeWithStem(sentence); - expect(tokens).toEqual(expected); - }); - }); - }); - - describe('bigramStemmedTokens', () => { - const dataSet = [ - { - sentence: 'John', - expected: ['john'], - }, - { - sentence: 'John Doe', - expected: ['john', 'doe'], - }, - { - sentence: 'GitHub Forking tutorial', - expected: ['github fork', 'fork tutori'], - }, - { - sentence: 'The Five Dysfunctions of a Team', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'The Girl with the Dragon Tattoo', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'The 7 Habits of Highly Effective People', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'Code that changes together stays together', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: "You rise to your level of your leadership's incompetence", - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'Shortening the feedback cycle', - expected: ['five dysfunct', 'dysfunct team'], - }, - ]; - - dataSet.forEach(({ sentence, expected }) => { - it(`Generates stemmed bigram tokens ("${sentence}", [${expected}]`, () => { - const bigramTokens = bigramStemmedTokens(sentence); - expect(bigramTokens).toEqual(expected); - }); - }); - }); - - describe('tokenizeText', () => { - it('Tokenize a sentence into an array of tokens', () => { - const paragraph = 'The "quick fox" jumps; ~over “the” _lazy dog.'; - - const tokens = tokenizeText(paragraph); - - expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); - expect(tokens.length).toEqual(16); - }); - - it('Tokenize a sentence with a line break', () => { - const paragraph = `This is a test note. - -spanning - -multiple lines`; - - const tokens = tokenizeText(paragraph); - - expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); - expect(tokens.length).toEqual(15); - }); - - it('Tokenize a sentence with an apostrophe', () => { - const paragraph = `1. 
“Shared client record”`; - - const tokens = tokenizeText(paragraph); - - expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); - expect(tokens.length).toEqual(8); - }); - - it('Stems a sentence', () => { - const paragraph = - 'The quick brown fox jumps over the changing, patiently; waiting doggy.'; - const expected = 'the quick brown fox jump over the chang, patient; wait doggi.'; - - const tokens = tokenizeText(paragraph); - const stems = stemTokens(tokens); - - expect(stems.map((t) => t.stem).join('')).toEqual(expected); - }); - - it('Map emitted stems to root tokens', () => { - const paragraph = 'The connecting and the consulting spirit'; // Maps to 'the connect and the consult spirit' - const searchEmits: Emit[] = [new Emit(4, 11, 'connect'), new Emit(20, 27, 'consult')]; - - const expectedMappedEmits: Emit[] = [ - new Emit(4, 14, 'connecting'), - new Emit(23, 33, 'consulting'), - ]; - - const stems = stemTokens(tokenizeText(paragraph)); - const mappedEmits = mapStemmedEmitsToOriginal(stems, searchEmits); - - expect(mappedEmits).toEqual(expectedMappedEmits); - }); - }); -}); diff --git a/src/indexing/utils.ts b/src/indexing/utils.ts deleted file mode 100644 index 80e7be3..0000000 --- a/src/indexing/utils.ts +++ /dev/null @@ -1,85 +0,0 @@ -import LIB_TOKENIZE from '@liquicode/lib-tokenize'; -import { Emit } from '@tanishiking/aho-corasick'; -import { PorterStemmer, NGrams, WordPunctTokenizer } from 'natural'; - -const tokenizer = LIB_TOKENIZE.NewTokenizer(); -tokenizer.whitespace = ` \t\r\n.“”`; -tokenizer.symbols = `,;=`; -tokenizer.literal_delimiters = `"`; -tokenizer.literal_escape_chars = `\\`; - -const tokenizer2 = new WordPunctTokenizer(); - -/** - * Tokenizes a string into words along with: - * (a) Removing stop words - * (b) Removing punctuation - * (c) Stemming words - */ -export const tokenizeWithStem = (text: string): string[] => { - return PorterStemmer.tokenizeAndStem(text); -}; - -export const bigramStemmedTokens = (text: string): string[] => { - const tokens = tokenizeWithStem(text); - - if (tokens.length > 2) { - const bigrams = NGrams.bigrams(tokens); - return bigrams.map((bigram) => bigram.join(' ')); - } - - return tokens; -}; - -type Token = { - at: number; - token: string; - type: 'wsp' | 'sym' | 'lit' | 'idf' | 'num' | 'kwd'; -}; - -type StemmedToken = { - stem: string; - stemStart: number; - stemEnd: number; - original: string; - originalStart: number; - originalEnd: number; -}; - -export const tokenizeText = (text: string): Token[] => { - console.log(tokenizer2.tokenize(text)) - console.log(tokenizer.tokenize(text)) - - return tokenizer.tokenize(text); -}; - -export const stemTokens = (tokens: Token[]): StemmedToken[] => { - let index = 0; - - return tokens.reduce((acc, t) => { - const stem = PorterStemmer.stem(t.token); - - acc.push({ - stem, - stemStart: index, - stemEnd: index + stem.length, - original: t.token, - originalStart: t.at, - originalEnd: t.at + t.token.length, - }); - - index += stem.length; - - return acc; - }, []); -}; - -export const mapStemmedEmitsToOriginal = (stems: StemmedToken[], emits: Emit[]): Emit[] => { - console.log('stemmed tokens', stems); - - return emits.map((e) => { - console.log('looking for in tokens', e); - const matchingStem = stems.find((s) => s.stemStart === e.start); - return new Emit(matchingStem.originalStart, matchingStem.originalEnd, e.keyword); - }); -}; diff --git a/src/search/index.ts b/src/search/index.ts index b539d3f..403fee7 100644 --- a/src/search/index.ts +++ b/src/search/index.ts @@ -1,19 +1,22 @@ 
import _ from 'lodash'; -import { Trie, Emit } from '@tanishiking/aho-corasick'; +import { Trie } from '@tanishiking/aho-corasick'; -import { redactText } from './search.utils'; import type { Indexer } from '../indexing/indexer'; +import { redactText } from './redactText'; +import { mapStemToOriginalText } from './mapStemToOriginalText'; +import { WordPunctStemTokenizer } from '../tokenizers'; -import { tokenizeText, stemTokens, mapStemmedEmitsToOriginal } from '../indexing/utils'; +const tokenizer = new WordPunctStemTokenizer(); -type SearchResult = { +export type SearchResult = { start: number; end: number; - keyword: string; + indexKeyword: string; + originalKeyword: string; }; -const isEqual = (a: Emit, b: Emit) => { - return a.start === b.start && a.keyword === b.keyword; +const isEqual = (a: SearchResult, b: SearchResult) => { + return a.start === b.start && a.indexKeyword === b.indexKeyword; }; export default class Search { @@ -38,40 +41,20 @@ export default class Search { public find(text: string): SearchResult[] { const redactedText = redactText(text); // Redact text that we don't want to be searched - console.log('redactedText'); - console.log(redactedText); - // Stem the text - const stemmedTokens = stemTokens(tokenizeText(redactedText)); - const stemmedText = stemmedTokens.map((t) => t.stem).join(''); - - console.log('stemmedText'); - console.log(stemmedText); + const tokens = tokenizer.tokenize(redactedText); + const stemmedText = tokens.map((t) => t.stem).join(''); // Search stemmed text - const stemmedResults = this.trie.parseText(stemmedText); - - console.log('stemmedResults'); - console.log(stemmedResults); + const emits = this.trie.parseText(stemmedText); // Map stemmed results to original text - const originalResults = mapStemmedEmitsToOriginal(stemmedTokens, stemmedResults); - - console.log('originalResults'); - console.log(originalResults); - - return this.mapToSearchResults(originalResults); - } - - private mapToSearchResults(results: Emit[]): SearchResult[] { - return _.uniqWith(results, isEqual) - .filter((result) => this.keywordExistsInIndex(result.keyword)) - .map((result) => ({ - start: result.start, - end: result.end + 1, - keyword: result.keyword, - })) - .sort((a, b) => a.start - b.start); // Must sort by start position to prepare for highlighting + return _.chain(emits) + .map((emit) => mapStemToOriginalText(emit, tokens)) + .uniqWith(isEqual) + .filter((result) => this.keywordExistsInIndex(result.indexKeyword)) + .sort((a, b) => a.start - b.start) // Must sort by start position to prepare for highlighting + .value(); } private keywordExistsInIndex(index: string): boolean { diff --git a/src/search/mapStemToOriginalText.ts b/src/search/mapStemToOriginalText.ts new file mode 100644 index 0000000..ab323b0 --- /dev/null +++ b/src/search/mapStemToOriginalText.ts @@ -0,0 +1,28 @@ +import { Emit } from '@tanishiking/aho-corasick'; + +import { SearchResult } from '../search/index'; +import { Token } from '../tokenizers'; + +/** + * Takes a given search result (which has the start/end position and a "stemmed" keyword) + * that was matched, and maps them to a new start/end position for the original keyword + * which was stem was created from + * @param searchResult + * @param tokens + * @returns + */ +export const mapStemToOriginalText = (searchResult: Emit, tokens: Token[]): SearchResult => { + const matchingTokens = tokens.filter( + (token) => token.stemStart >= searchResult.start && token.stemEnd <= searchResult.end + 1 + ); + + return { + start: 
matchingTokens[0].originalStart, + end: matchingTokens[matchingTokens.length - 1].originalEnd, + indexKeyword: matchingTokens + .map((token) => token.stem) + .join('') + .toLowerCase(), + originalKeyword: matchingTokens.map((token) => token.originalText).join(''), + }; +}; diff --git a/src/search/search.utils.spec.ts b/src/search/redactText.spec.ts similarity index 97% rename from src/search/search.utils.spec.ts rename to src/search/redactText.spec.ts index 3639ed3..884e458 100644 --- a/src/search/search.utils.spec.ts +++ b/src/search/redactText.spec.ts @@ -1,4 +1,4 @@ -import { redactText } from './search.utils'; +import { redactText } from './redactText'; describe('redactText', () => { it('Hashtags are redacted', () => { diff --git a/src/search/search.utils.ts b/src/search/redactText.ts similarity index 100% rename from src/search/search.utils.ts rename to src/search/redactText.ts diff --git a/src/search/search.spec.ts b/src/search/search.spec.ts new file mode 100644 index 0000000..d993249 --- /dev/null +++ b/src/search/search.spec.ts @@ -0,0 +1,81 @@ +import { Indexer } from '../indexing/indexer'; +import Search from './index'; + +const getKeywordsMockFn = jest.fn(); + +jest.mock('../indexing/indexer', () => { + return { + Indexer: jest.fn().mockImplementation(() => { + return { + getKeywords: getKeywordsMockFn, + getDocumentsByKeyword: () => [{}], + }; + }), + }; +}); + +beforeEach(() => { + jest.clearAllMocks(); +}); + +describe('Search class', () => { + it('Highlights single keywords that can be stemmed', () => { + getKeywordsMockFn.mockReturnValue(['search', 'note']); + const text = 'This is a note that I will be use for searching'; + + const indexer = new Indexer(null); + const search = new Search(indexer); + const results = search.find(text); + + expect(results).toEqual([ + { + start: 10, + end: 14, + indexKeyword: 'note', + originalKeyword: 'note', + }, + { + start: 38, + end: 47, + indexKeyword: 'search', + originalKeyword: 'searching', + }, + ]); + }); + + it('Longer keyword matches are always prioritised for highlight', () => { + getKeywordsMockFn.mockReturnValue(['github', 'github fork']); + const text = 'I use GitHub Forks as part of my development flow'; + + const indexer = new Indexer(null); + const search = new Search(indexer); + const results = search.find(text); + + expect(results).toEqual([ + { + start: 6, + end: 18, + indexKeyword: 'github fork', + originalKeyword: 'GitHub Forks', + }, + ]); + }); + + it('Three word keyword is highlighted', () => { + getKeywordsMockFn.mockReturnValue(['shared', 'client', 'record', 'share client record']); + const text = 'Designing a shared client record is a great idea but challenging'; + + const indexer = new Indexer(null); + const search = new Search(indexer); + const results = search.find(text); + + expect(results).toEqual([ + { + start: 12, + end: 32, + indexKeyword: 'share client record', + originalKeyword: 'shared client record', + }, + ]); + }); +}); diff --git a/src/stemmers/index.ts b/src/stemmers/index.ts new file mode 100644 index 0000000..8c3bc9a --- /dev/null +++ b/src/stemmers/index.ts @@ -0,0 +1,21 @@ +import { PorterStemmer } from 'natural'; + +import { WordPunctStemTokenizer } from '../tokenizers'; + +/** + * Stem a given phrase. 
If the phrase is made up of multiple words, + * the last word in the phrase is the only one that will be stemmed + * @param text input text + * @returns stemmed text + */ +export const stemLastWord = (text: string): string => { + return PorterStemmer.stem(text); +}; + +export const stemPhrase = (text: string): string => { + const tokenizer = new WordPunctStemTokenizer(); + return tokenizer + .tokenize(text) + .map((t) => t.stem) + .join(''); +}; diff --git a/src/tokenizers/index.ts b/src/tokenizers/index.ts new file mode 100644 index 0000000..8291720 --- /dev/null +++ b/src/tokenizers/index.ts @@ -0,0 +1,86 @@ +import _ from 'lodash'; +import { PorterStemmer, NGrams } from 'natural'; +import { Trie } from '@tanishiking/aho-corasick'; +import * as natural from 'natural'; + +import { stemLastWord } from '../stemmers'; + +export type Token = { + index: number; + originalText: string; + originalStart: number; + originalEnd: number; + stem: string; + stemStart: number; + stemEnd: number; +}; + +export class WordPermutationsTokenizer { + private trie: Trie; + + constructor() { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const stopWords: string[] = (natural as any).stopwords; + + this.trie = new Trie(stopWords, { + allowOverlaps: false, + onlyWholeWords: true, + caseInsensitive: true, + }); + } + + public tokenize(text: string): string[] { + const tokens = PorterStemmer.tokenizeAndStem(text); // Strip punctuation and stop words, stem remaining words + + if (tokens.length >= 5) { + return [...tokens, ...NGrams.bigrams(tokens).map((tokens) => tokens.join(' '))]; + } + + return this.combinations(tokens, 2, 2); + } + + private combinations(arr: string[], min: number, max: number) { + return [...Array(max).keys()] + .reduce((result) => { + return arr.concat( + result.flatMap((val) => + arr.filter((char) => char !== val).map((char) => `${val} ${char}`) + ) + ); + }, []) + .filter((val) => val.length >= min); + } +} + +export class WordPunctStemTokenizer { + private pattern = /([\s]+|[A-zÀ-ÿ-]+|[0-9._]+|.|!|\?|'|"|:|;|,|-)/i; + + public tokenize(text: string): Token[] { + const tokens = text.split(this.pattern); + return _.chain(tokens).without('').transform(this.stringToTokenAccumulator()).value(); + } + + private stringToTokenAccumulator() { + let originalCharIndex = 0; + let stemCharIndex = 0; + + return (acc: Token[], token: string, index: number) => { + const stemmedToken = stemLastWord(token); + + acc.push({ + index, + originalText: token, + originalStart: originalCharIndex, + originalEnd: originalCharIndex + token.length, + stem: stemmedToken, + stemStart: stemCharIndex, + stemEnd: stemCharIndex + stemmedToken.length, + }); + + originalCharIndex += token.length; + stemCharIndex += stemmedToken.length; + + return acc; + }; + } +} diff --git a/src/tokenizers/tokenizer.spec.ts b/src/tokenizers/tokenizer.spec.ts new file mode 100644 index 0000000..64d1fb4 --- /dev/null +++ b/src/tokenizers/tokenizer.spec.ts @@ -0,0 +1,93 @@ +import { WordPermutationsTokenizer, WordPunctStemTokenizer } from '.'; + +describe('WordPermutationsTokenizer', () => { + const dataSet = [ + { + description: 'Single word', + sentence: 'John', + expected: ['john'], + }, + { + description: 'Two words with no stop words', + sentence: 'John Doe', + expected: ['john', 'doe', 'john doe', 'doe john'], + }, + { + description: 'Two words (with one stop word at the start)', + sentence: 'The brothers Karamazov', + expected: ['brother', 'karamazov', 'brother karamazov', 'karamazov brother'], + }, + { + description: 
'Two words (with stop words throughout the sentence)', + sentence: 'An Officer and a Spy', + expected: ['offic', 'spy', 'offic spy', 'spy offic'], + }, + { + description: 'Three words with no stop words', + sentence: 'GitHub Forking tutorial', + expected: [ + 'github', + 'fork', + 'tutori', + 'github fork', + 'github tutori', + 'fork github', + 'fork tutori', + 'tutori github', + 'tutori fork', + ], + }, + + { + description: 'Five words or more does not generate permutations', + sentence: 'Ten Arguments For Deleting Your Social Media Accounts Right Now', + expected: [ + 'ten', + 'argument', + 'delet', + 'social', + 'media', + 'account', + 'right', + 'ten argument', + 'argument delet', + 'delet social', + 'social media', + 'media account', + 'account right', + ], + }, + ]; + + dataSet.forEach(({ description, sentence, expected }) => { + it(`Tokenize phase permutations (${description})`, () => { + const tokenizer = new WordPermutationsTokenizer(); + const tokens = tokenizer.tokenize(sentence); + + expect(tokens).toEqual(expected); + }); + }); +}); + +describe('WordPunctStemTokenizer', () => { + it('Tokenize and stem a simple phrase', () => { + const sentence = 'The lazy dog jumped over the fence.'; + + const tokenizer = new WordPunctStemTokenizer(); + const tokens = tokenizer.tokenize(sentence); + + expect(tokens.length).toEqual(14); + + expect(tokens[2]).toEqual({ + index: 2, + originalText: 'lazy', + originalStart: 4, + originalEnd: 8, + stem: 'lazi', + stemStart: 4, + stemEnd: 8, + }); + + expect(tokens[6].stem).toEqual('jump'); + }); +}); From 5b19203263cbf86a4aeb95f6f4ea91350f606a97 Mon Sep 17 00:00:00 2001 From: Hady Osman Date: Sun, 27 Feb 2022 01:06:59 +1300 Subject: [PATCH 3/3] Remove @liquicode/lib-tokenize dependency --- package.json | 1 - 1 file changed, 1 deletion(-) diff --git a/package.json b/package.json index 6eaa209..a1826e2 100644 --- a/package.json +++ b/package.json @@ -69,7 +69,6 @@ "*.{js,css,md}": "prettier --write" }, "dependencies": { - "@liquicode/lib-tokenize": "^0.1.4", "@tanishiking/aho-corasick": "^0.0.1", "@types/natural": "^5.1.0", "lodash": "^4.17.21",
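
The series above makes the search run over stemmed text and then translate each hit back into coordinates of the original note (see mapStemToOriginalText.ts and WordPunctStemTokenizer introduced in PATCH 2/3). The following is a minimal, self-contained TypeScript sketch of that offset bookkeeping only; naiveStem, sketchTokenize, and mapHitToOriginal are illustrative stand-ins, not the plugin's actual PorterStemmer and aho-corasick based implementation.

// Sketch: the search happens over the concatenation of stems, so every token
// records two spans: its place in the original text and its place in the
// stemmed text. Hits found in stem coordinates are mapped back through them.

type SketchToken = {
  originalText: string;
  originalStart: number;
  originalEnd: number;
  stem: string;
  stemStart: number;
  stemEnd: number;
};

// Hypothetical stemmer stand-in (the plugin itself uses natural's PorterStemmer).
const naiveStem = (word: string): string => word.toLowerCase().replace(/(ing|ed|s)$/, '');

// Split into word and whitespace tokens so that offsets stay contiguous.
const sketchTokenize = (text: string): SketchToken[] => {
  const tokens: SketchToken[] = [];
  let originalAt = 0;
  let stemAt = 0;
  for (const part of text.split(/(\s+)/).filter((p) => p !== '')) {
    const stem = /^\s+$/.test(part) ? part : naiveStem(part);
    tokens.push({
      originalText: part,
      originalStart: originalAt,
      originalEnd: originalAt + part.length,
      stem,
      stemStart: stemAt,
      stemEnd: stemAt + stem.length,
    });
    originalAt += part.length;
    stemAt += stem.length;
  }
  return tokens;
};

// Given a hit over the stemmed text (start inclusive, end exclusive),
// return the covered span of the original text.
const mapHitToOriginal = (tokens: SketchToken[], start: number, end: number) => {
  const covered = tokens.filter((t) => t.stemStart >= start && t.stemEnd <= end);
  return {
    start: covered[0].originalStart,
    end: covered[covered.length - 1].originalEnd,
    originalKeyword: covered.map((t) => t.originalText).join(''),
  };
};

// Usage: "searching" stems to "search"; a hit on the stemmed text maps back to
// characters 38..47 of the original sentence, matching the expectation in search.spec.ts.
const tokens = sketchTokenize('This is a note that I will be use for searching');
const stemmedText = tokens.map((t) => t.stem).join('');
const hitStart = stemmedText.indexOf('search');
console.log(mapHitToOriginal(tokens, hitStart, hitStart + 'search'.length));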