From fc3e85a7a8b06626d0aa39c31d46ee9bbc7fefbd Mon Sep 17 00:00:00 2001 From: Hady Osman Date: Thu, 24 Feb 2022 22:14:33 +1300 Subject: [PATCH 1/3] Initial work in progress --- package.json | 1 + src/components/suggestionsPopup.ts | 22 ++-- src/indexing/indexer.ts | 15 ++- src/indexing/utils.spec.ts | 169 ++++++++++++++++++++++++----- src/indexing/utils.ts | 86 ++++++++++++++- src/search/index.ts | 26 ++++- webpack.config.ts | 1 + 7 files changed, 276 insertions(+), 44 deletions(-) diff --git a/package.json b/package.json index dcab5f8..b6781c9 100644 --- a/package.json +++ b/package.json @@ -69,6 +69,7 @@ "*.{js,css,md}": "prettier --write" }, "dependencies": { + "@liquicode/lib-tokenize": "^0.1.4", "@tanishiking/aho-corasick": "^0.0.1", "@types/natural": "^5.1.0", "lodash": "^4.17.21", diff --git a/src/components/suggestionsPopup.ts b/src/components/suggestionsPopup.ts index 7eee461..9c81b32 100644 --- a/src/components/suggestionsPopup.ts +++ b/src/components/suggestionsPopup.ts @@ -14,16 +14,18 @@ const item = (icon, title, click) => { export const showSuggestionsModal = (props: SuggestionsModalProps): void => { const { app, mouseEvent, suggestions, onClick } = props; - const menu = new Menu(app); + setTimeout(() => { + const menu = new Menu(app); - suggestions.forEach((replaceText) => { - menu.addItem( - item('pencil', `Replace with ${replaceText}`, () => { - onClick(replaceText); - }) - ); - }); + suggestions.forEach((replaceText) => { + menu.addItem( + item('pencil', `Replace with ${replaceText}`, () => { + onClick(replaceText); + }) + ); + }); - menu.addSeparator(); - menu.showAtMouseEvent(mouseEvent); + menu.addSeparator(); + menu.showAtMouseEvent(mouseEvent); + }, 100); }; diff --git a/src/indexing/indexer.ts b/src/indexing/indexer.ts index 787ae00..8244793 100644 --- a/src/indexing/indexer.ts +++ b/src/indexing/indexer.ts @@ -1,14 +1,16 @@ +import _ from 'lodash'; import lokijs from 'lokijs'; import { TypedEmitter } from 'tiny-typed-emitter'; import type { TFile } from 'obsidian'; -import { tokenize } from './utils'; +import { tokenizeWithStem } from './utils'; import type { PluginHelper } from '../plugin-helper'; type Document = { fileCreationTime: number; type: 'tag' | 'alias' | 'page' | 'page-token'; keyword: string; + originalText: string; replaceText: string; }; @@ -34,9 +36,11 @@ export class Indexer extends TypedEmitter { // Exclude any keywords associated with active file as we don't want recursive highlighting const exclusionFile = this.pluginHelper.activeFile; - return this.documents + const keywords = this.documents .where((doc) => doc.fileCreationTime !== exclusionFile.stat.ctime) .map((doc) => doc.keyword); + + return _.uniq(keywords); } public getDocumentsByKeyword(keyword: string): Document[] { @@ -45,6 +49,7 @@ export class Indexer extends TypedEmitter { public buildIndex(): void { this.pluginHelper.getAllFiles().forEach((file) => this.indexFile(file)); + console.log('index has been built', this.documents); this.emit('indexRebuilt'); } @@ -63,14 +68,16 @@ export class Indexer extends TypedEmitter { fileCreationTime: file.stat.ctime, type: 'page', keyword: file.basename.toLowerCase(), + originalText: file.basename, replaceText: `[[${file.basename}]]`, }); - tokenize(file.basename).forEach((token) => { + tokenizeWithStem(file.basename).forEach((token) => { this.documents.insert({ fileCreationTime: file.stat.ctime, type: 'page-token', keyword: token, + originalText: file.basename, replaceText: `[[${file.basename}]]`, }); }); @@ -80,6 +87,7 @@ export class Indexer 
extends TypedEmitter { fileCreationTime: file.stat.ctime, type: 'alias', keyword: alias.toLowerCase(), + originalText: file.basename, replaceText: `[[${file.basename}|${alias}]]`, }); }); @@ -89,6 +97,7 @@ export class Indexer extends TypedEmitter { fileCreationTime: file.stat.ctime, type: 'tag', keyword: tag.replace(/#/, '').toLowerCase(), + originalText: tag, replaceText: tag, }); }); diff --git a/src/indexing/utils.spec.ts b/src/indexing/utils.spec.ts index ab71c50..45c7dc4 100644 --- a/src/indexing/utils.spec.ts +++ b/src/indexing/utils.spec.ts @@ -1,29 +1,146 @@ -import { tokenize } from './utils'; - -describe('tokenize', () => { - const dataSet = [ - { - sentence: 'The quick brown fox jumps over the lazy dog.', - expected: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'], - }, - { - sentence: 'GitHub Forks', - expected: ['github', 'fork'], - }, - { - sentence: 'John Doe', - expected: ['john', 'doe'], - }, - { - sentence: 'Approximate Inference', - expected: ['approxim', 'infer'], - }, - ]; - - dataSet.forEach(({ sentence, expected }) => { - it(`Tokenizes and removes stop words ("${sentence}", [${expected}]`, () => { - const tokens = tokenize(sentence); - expect(tokens).toEqual(expected); +import { Emit } from '@tanishiking/aho-corasick'; + +import { + bigramStemmedTokens, + tokenizeWithStem, + tokenizeText, + stemTokens, + mapStemmedEmitsToOriginal, +} from './utils'; + +describe.only('utils', () => { + describe('tokenizeWithStem', () => { + const dataSet = [ + { + sentence: 'The quick brown fox jumps over the lazy dog.', + expected: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'], + }, + { + sentence: 'GitHub Forks', + expected: ['github', 'fork'], + }, + { + sentence: 'John Doe', + expected: ['john', 'doe'], + }, + { + sentence: 'Approximate Inference', + expected: ['approxim', 'infer'], + }, + ]; + + dataSet.forEach(({ sentence, expected }) => { + it(`Tokenizes and removes stop words ("${sentence}", [${expected}]`, () => { + const tokens = tokenizeWithStem(sentence); + expect(tokens).toEqual(expected); + }); + }); + }); + + describe('bigramStemmedTokens', () => { + const dataSet = [ + { + sentence: 'John', + expected: ['john'], + }, + { + sentence: 'John Doe', + expected: ['john', 'doe'], + }, + { + sentence: 'GitHub Forking tutorial', + expected: ['github fork', 'fork tutori'], + }, + { + sentence: 'The Five Dysfunctions of a Team', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'The Girl with the Dragon Tattoo', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'The 7 Habits of Highly Effective People', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'Code that changes together stays together', + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: "You rise to your level of your leadership's incompetence", + expected: ['five dysfunct', 'dysfunct team'], + }, + { + sentence: 'Shortening the feedback cycle', + expected: ['five dysfunct', 'dysfunct team'], + }, + ]; + + dataSet.forEach(({ sentence, expected }) => { + it(`Generates stemmed bigram tokens ("${sentence}", [${expected}]`, () => { + const bigramTokens = bigramStemmedTokens(sentence); + expect(bigramTokens).toEqual(expected); + }); + }); + }); + + describe('tokenizeText', () => { + it('Tokenize a sentence into an array of tokens', () => { + const paragraph = 'The "quick fox" jumps; ~over “the” _lazy dog.'; + + const tokens = tokenizeText(paragraph); + + expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); + 
expect(tokens.length).toEqual(16); + }); + + it('Tokenize a sentence with a line break', () => { + const paragraph = `This is a test note. + +spanning + +multiple lines`; + + const tokens = tokenizeText(paragraph); + + expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); + expect(tokens.length).toEqual(15); + }); + + it('Tokenize a sentence with an apostrophe', () => { + const paragraph = `1. “Shared client record”`; + + const tokens = tokenizeText(paragraph); + + expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); + expect(tokens.length).toEqual(8); + }); + + it('Stems a sentence', () => { + const paragraph = + 'The quick brown fox jumps over the changing, patiently; waiting doggy.'; + const expected = 'the quick brown fox jump over the chang, patient; wait doggi.'; + + const tokens = tokenizeText(paragraph); + const stems = stemTokens(tokens); + + expect(stems.map((t) => t.stem).join('')).toEqual(expected); + }); + + it('Map emitted stems to root tokens', () => { + const paragraph = 'The connecting and the consulting spirit'; // Maps to 'the connect and the consult spirit' + const searchEmits: Emit[] = [new Emit(4, 11, 'connect'), new Emit(20, 27, 'consult')]; + + const expectedMappedEmits: Emit[] = [ + new Emit(4, 14, 'connecting'), + new Emit(23, 33, 'consulting'), + ]; + + const stems = stemTokens(tokenizeText(paragraph)); + const mappedEmits = mapStemmedEmitsToOriginal(stems, searchEmits); + + expect(mappedEmits).toEqual(expectedMappedEmits); }); }); }); diff --git a/src/indexing/utils.ts b/src/indexing/utils.ts index 3318cd4..80e7be3 100644 --- a/src/indexing/utils.ts +++ b/src/indexing/utils.ts @@ -1,5 +1,85 @@ -import natural from 'natural'; +import LIB_TOKENIZE from '@liquicode/lib-tokenize'; +import { Emit } from '@tanishiking/aho-corasick'; +import { PorterStemmer, NGrams, WordPunctTokenizer } from 'natural'; -export const tokenize = (text: string): string[] => { - return natural.PorterStemmer.tokenizeAndStem(text); +const tokenizer = LIB_TOKENIZE.NewTokenizer(); +tokenizer.whitespace = ` \t\r\n.“”`; +tokenizer.symbols = `,;=`; +tokenizer.literal_delimiters = `"`; +tokenizer.literal_escape_chars = `\\`; + +const tokenizer2 = new WordPunctTokenizer(); + +/** + * Tokenizes a string into words along with: + * (a) Removing stop words + * (b) Removing punctuation + * (c) Stemming words + */ +export const tokenizeWithStem = (text: string): string[] => { + return PorterStemmer.tokenizeAndStem(text); +}; + +export const bigramStemmedTokens = (text: string): string[] => { + const tokens = tokenizeWithStem(text); + + if (tokens.length > 2) { + const bigrams = NGrams.bigrams(tokens); + return bigrams.map((bigram) => bigram.join(' ')); + } + + return tokens; +}; + +type Token = { + at: number; + token: string; + type: 'wsp' | 'sym' | 'lit' | 'idf' | 'num' | 'kwd'; +}; + +type StemmedToken = { + stem: string; + stemStart: number; + stemEnd: number; + original: string; + originalStart: number; + originalEnd: number; +}; + +export const tokenizeText = (text: string): Token[] => { + console.log(tokenizer2.tokenize(text)) + console.log(tokenizer.tokenize(text)) + + return tokenizer.tokenize(text); +}; + +export const stemTokens = (tokens: Token[]): StemmedToken[] => { + let index = 0; + + return tokens.reduce((acc, t) => { + const stem = PorterStemmer.stem(t.token); + + acc.push({ + stem, + stemStart: index, + stemEnd: index + stem.length, + original: t.token, + originalStart: t.at, + originalEnd: t.at + t.token.length, + }); + + index += stem.length; + + return acc; + }, 
[]); +}; + +export const mapStemmedEmitsToOriginal = (stems: StemmedToken[], emits: Emit[]): Emit[] => { + console.log('stemmed tokens', stems); + + return emits.map((e) => { + console.log('looking for in tokens', e); + const matchingStem = stems.find((s) => s.stemStart === e.start); + return new Emit(matchingStem.originalStart, matchingStem.originalEnd, e.keyword); + }); }; diff --git a/src/search/index.ts b/src/search/index.ts index 79944c8..7ab9e00 100644 --- a/src/search/index.ts +++ b/src/search/index.ts @@ -3,6 +3,8 @@ import { Trie, Emit } from '@tanishiking/aho-corasick'; import type { Indexer } from '../indexing/indexer'; +import { tokenizeText, stemTokens, mapStemmedEmitsToOriginal } from '../indexing/utils'; + type SearchResult = { start: number; end: number; @@ -35,9 +37,29 @@ export default class Search { public find(text: string): SearchResult[] { const redactedText = this.redactText(text); // Redact text that we don't want to be searched - const results = this.trie.parseText(redactedText); + console.log('redactedText'); + console.log(redactedText); + + // Stem the text + const stemmedTokens = stemTokens(tokenizeText(redactedText)); + const stemmedText = stemmedTokens.map((t) => t.stem).join(''); + + console.log('stemmedText'); + console.log(stemmedText); + + // Search stemmed text + const stemmedResults = this.trie.parseText(stemmedText); + + console.log('stemmedResults'); + console.log(stemmedResults); + + // Map stemmed results to original text + const originalResults = mapStemmedEmitsToOriginal(stemmedTokens, stemmedResults); + + console.log('originalResults'); + console.log(originalResults); - return this.mapToSearchResults(results); + return this.mapToSearchResults(originalResults); } private mapToSearchResults(results: Emit[]): SearchResult[] { diff --git a/webpack.config.ts b/webpack.config.ts index 1779028..09b828a 100644 --- a/webpack.config.ts +++ b/webpack.config.ts @@ -72,6 +72,7 @@ const config: Configuration = { '@codemirror/view': 'commonjs2 @codemirror/view', '@codemirror/state': 'commonjs2 @codemirror/state', '@codemirror/rangeset': 'commonjs2 @codemirror/rangeset', + 'webworker-threads': 'require(webworker-threads)', }, }; From 6517263a6bfdf56ac7e4430632bda5f049e106f7 Mon Sep 17 00:00:00 2001 From: Hady Osman Date: Sun, 27 Feb 2022 01:04:45 +1300 Subject: [PATCH 2/3] Re-engineer index to match both partial and multiple words --- manifest.json | 2 +- package.json | 2 +- src/cmExtension/suggestionsExtension.ts | 10 +- src/indexing/indexer.ts | 11 +- src/indexing/utils.spec.ts | 146 ------------------ src/indexing/utils.ts | 85 ---------- src/search/index.ts | 55 +++---- src/search/mapStemToOriginalText.ts | 28 ++++ ...earch.utils.spec.ts => redactText.spec.ts} | 2 +- src/search/{search.utils.ts => redactText.ts} | 0 src/search/search.spec.ts | 81 ++++++++++ src/stemmers/index.ts | 21 +++ src/tokenizers/index.ts | 86 +++++++++++ src/tokenizers/tokenizer.spec.ts | 93 +++++++++++ 14 files changed, 343 insertions(+), 279 deletions(-) delete mode 100644 src/indexing/utils.spec.ts delete mode 100644 src/indexing/utils.ts create mode 100644 src/search/mapStemToOriginalText.ts rename src/search/{search.utils.spec.ts => redactText.spec.ts} (97%) rename src/search/{search.utils.ts => redactText.ts} (100%) create mode 100644 src/search/search.spec.ts create mode 100644 src/stemmers/index.ts create mode 100644 src/tokenizers/index.ts create mode 100644 src/tokenizers/tokenizer.spec.ts diff --git a/manifest.json b/manifest.json index e30c5b6..e474c4e 100644 --- 
a/manifest.json +++ b/manifest.json @@ -2,7 +2,7 @@ "id": "obsidian-sidekick", "name": "Sidekick", "description": "A companion to identify hidden connections that match your tags and pages", - "version": "1.4.3", + "version": "1.5.0", "minAppVersion": "0.13.8", "author": "Hady Osman", "authorUrl": "https://hady.geek.nz", diff --git a/package.json b/package.json index d8ae268..6eaa209 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "obsidian-sidekick", - "version": "1.4.3", + "version": "1.5.0", "description": "A companion to identify hidden connections that match your tags and pages", "main": "src/index.ts", "repository": { diff --git a/src/cmExtension/suggestionsExtension.ts b/src/cmExtension/suggestionsExtension.ts index 718e8de..a43e0cb 100644 --- a/src/cmExtension/suggestionsExtension.ts +++ b/src/cmExtension/suggestionsExtension.ts @@ -16,11 +16,11 @@ import './suggestionsExtension.css'; const SuggestionCandidateClass = 'cm-suggestion-candidate'; -const underlineDecoration = (start: number, end: number, keyword: string) => +const underlineDecoration = (start: number, end: number, indexKeyword: string) => Decoration.mark({ class: SuggestionCandidateClass, attributes: { - 'data-keyword': keyword, + 'data-index-keyword': indexKeyword, 'data-position-start': `${start}`, 'data-position-end': `${end}`, }, @@ -68,7 +68,7 @@ export const suggestionsExtension = (search: Search, app: App): ViewPlugin { view.dispatch({ changes: { diff --git a/src/indexing/indexer.ts b/src/indexing/indexer.ts index 8244793..3fbc4c5 100644 --- a/src/indexing/indexer.ts +++ b/src/indexing/indexer.ts @@ -3,7 +3,8 @@ import lokijs from 'lokijs'; import { TypedEmitter } from 'tiny-typed-emitter'; import type { TFile } from 'obsidian'; -import { tokenizeWithStem } from './utils'; +import { stemPhrase } from '../stemmers'; +import { WordPermutationsTokenizer } from '../tokenizers'; import type { PluginHelper } from '../plugin-helper'; type Document = { @@ -21,6 +22,7 @@ interface IndexerEvents { export class Indexer extends TypedEmitter { private documents: Collection; + private permutationTokenizer: WordPermutationsTokenizer; constructor(private pluginHelper: PluginHelper) { super(); @@ -30,6 +32,8 @@ export class Indexer extends TypedEmitter { this.documents = db.addCollection('documents', { indices: ['fileCreationTime', 'keyword'], }); + + this.permutationTokenizer = new WordPermutationsTokenizer(); } public getKeywords(): string[] { @@ -49,7 +53,6 @@ export class Indexer extends TypedEmitter { public buildIndex(): void { this.pluginHelper.getAllFiles().forEach((file) => this.indexFile(file)); - console.log('index has been built', this.documents); this.emit('indexRebuilt'); } @@ -67,12 +70,12 @@ export class Indexer extends TypedEmitter { this.documents.insert({ fileCreationTime: file.stat.ctime, type: 'page', - keyword: file.basename.toLowerCase(), + keyword: stemPhrase(file.basename), originalText: file.basename, replaceText: `[[${file.basename}]]`, }); - tokenizeWithStem(file.basename).forEach((token) => { + this.permutationTokenizer.tokenize(file.basename).forEach((token) => { this.documents.insert({ fileCreationTime: file.stat.ctime, type: 'page-token', diff --git a/src/indexing/utils.spec.ts b/src/indexing/utils.spec.ts deleted file mode 100644 index 45c7dc4..0000000 --- a/src/indexing/utils.spec.ts +++ /dev/null @@ -1,146 +0,0 @@ -import { Emit } from '@tanishiking/aho-corasick'; - -import { - bigramStemmedTokens, - tokenizeWithStem, - tokenizeText, - stemTokens, - 
mapStemmedEmitsToOriginal, -} from './utils'; - -describe.only('utils', () => { - describe('tokenizeWithStem', () => { - const dataSet = [ - { - sentence: 'The quick brown fox jumps over the lazy dog.', - expected: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'], - }, - { - sentence: 'GitHub Forks', - expected: ['github', 'fork'], - }, - { - sentence: 'John Doe', - expected: ['john', 'doe'], - }, - { - sentence: 'Approximate Inference', - expected: ['approxim', 'infer'], - }, - ]; - - dataSet.forEach(({ sentence, expected }) => { - it(`Tokenizes and removes stop words ("${sentence}", [${expected}]`, () => { - const tokens = tokenizeWithStem(sentence); - expect(tokens).toEqual(expected); - }); - }); - }); - - describe('bigramStemmedTokens', () => { - const dataSet = [ - { - sentence: 'John', - expected: ['john'], - }, - { - sentence: 'John Doe', - expected: ['john', 'doe'], - }, - { - sentence: 'GitHub Forking tutorial', - expected: ['github fork', 'fork tutori'], - }, - { - sentence: 'The Five Dysfunctions of a Team', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'The Girl with the Dragon Tattoo', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'The 7 Habits of Highly Effective People', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'Code that changes together stays together', - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: "You rise to your level of your leadership's incompetence", - expected: ['five dysfunct', 'dysfunct team'], - }, - { - sentence: 'Shortening the feedback cycle', - expected: ['five dysfunct', 'dysfunct team'], - }, - ]; - - dataSet.forEach(({ sentence, expected }) => { - it(`Generates stemmed bigram tokens ("${sentence}", [${expected}]`, () => { - const bigramTokens = bigramStemmedTokens(sentence); - expect(bigramTokens).toEqual(expected); - }); - }); - }); - - describe('tokenizeText', () => { - it('Tokenize a sentence into an array of tokens', () => { - const paragraph = 'The "quick fox" jumps; ~over “the” _lazy dog.'; - - const tokens = tokenizeText(paragraph); - - expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); - expect(tokens.length).toEqual(16); - }); - - it('Tokenize a sentence with a line break', () => { - const paragraph = `This is a test note. - -spanning - -multiple lines`; - - const tokens = tokenizeText(paragraph); - - expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); - expect(tokens.length).toEqual(15); - }); - - it('Tokenize a sentence with an apostrophe', () => { - const paragraph = `1. 
“Shared client record”`; - - const tokens = tokenizeText(paragraph); - - expect(tokens.map((t) => t.token).join('')).toEqual(paragraph); - expect(tokens.length).toEqual(8); - }); - - it('Stems a sentence', () => { - const paragraph = - 'The quick brown fox jumps over the changing, patiently; waiting doggy.'; - const expected = 'the quick brown fox jump over the chang, patient; wait doggi.'; - - const tokens = tokenizeText(paragraph); - const stems = stemTokens(tokens); - - expect(stems.map((t) => t.stem).join('')).toEqual(expected); - }); - - it('Map emitted stems to root tokens', () => { - const paragraph = 'The connecting and the consulting spirit'; // Maps to 'the connect and the consult spirit' - const searchEmits: Emit[] = [new Emit(4, 11, 'connect'), new Emit(20, 27, 'consult')]; - - const expectedMappedEmits: Emit[] = [ - new Emit(4, 14, 'connecting'), - new Emit(23, 33, 'consulting'), - ]; - - const stems = stemTokens(tokenizeText(paragraph)); - const mappedEmits = mapStemmedEmitsToOriginal(stems, searchEmits); - - expect(mappedEmits).toEqual(expectedMappedEmits); - }); - }); -}); diff --git a/src/indexing/utils.ts b/src/indexing/utils.ts deleted file mode 100644 index 80e7be3..0000000 --- a/src/indexing/utils.ts +++ /dev/null @@ -1,85 +0,0 @@ -import LIB_TOKENIZE from '@liquicode/lib-tokenize'; -import { Emit } from '@tanishiking/aho-corasick'; -import { PorterStemmer, NGrams, WordPunctTokenizer } from 'natural'; - -const tokenizer = LIB_TOKENIZE.NewTokenizer(); -tokenizer.whitespace = ` \t\r\n.“”`; -tokenizer.symbols = `,;=`; -tokenizer.literal_delimiters = `"`; -tokenizer.literal_escape_chars = `\\`; - -const tokenizer2 = new WordPunctTokenizer(); - -/** - * Tokenizes a string into words along with: - * (a) Removing stop words - * (b) Removing punctuation - * (c) Stemming words - */ -export const tokenizeWithStem = (text: string): string[] => { - return PorterStemmer.tokenizeAndStem(text); -}; - -export const bigramStemmedTokens = (text: string): string[] => { - const tokens = tokenizeWithStem(text); - - if (tokens.length > 2) { - const bigrams = NGrams.bigrams(tokens); - return bigrams.map((bigram) => bigram.join(' ')); - } - - return tokens; -}; - -type Token = { - at: number; - token: string; - type: 'wsp' | 'sym' | 'lit' | 'idf' | 'num' | 'kwd'; -}; - -type StemmedToken = { - stem: string; - stemStart: number; - stemEnd: number; - original: string; - originalStart: number; - originalEnd: number; -}; - -export const tokenizeText = (text: string): Token[] => { - console.log(tokenizer2.tokenize(text)) - console.log(tokenizer.tokenize(text)) - - return tokenizer.tokenize(text); -}; - -export const stemTokens = (tokens: Token[]): StemmedToken[] => { - let index = 0; - - return tokens.reduce((acc, t) => { - const stem = PorterStemmer.stem(t.token); - - acc.push({ - stem, - stemStart: index, - stemEnd: index + stem.length, - original: t.token, - originalStart: t.at, - originalEnd: t.at + t.token.length, - }); - - index += stem.length; - - return acc; - }, []); -}; - -export const mapStemmedEmitsToOriginal = (stems: StemmedToken[], emits: Emit[]): Emit[] => { - console.log('stemmed tokens', stems); - - return emits.map((e) => { - console.log('looking for in tokens', e); - const matchingStem = stems.find((s) => s.stemStart === e.start); - return new Emit(matchingStem.originalStart, matchingStem.originalEnd, e.keyword); - }); -}; diff --git a/src/search/index.ts b/src/search/index.ts index b539d3f..403fee7 100644 --- a/src/search/index.ts +++ b/src/search/index.ts @@ -1,19 +1,22 @@ 
import _ from 'lodash'; -import { Trie, Emit } from '@tanishiking/aho-corasick'; +import { Trie } from '@tanishiking/aho-corasick'; -import { redactText } from './search.utils'; import type { Indexer } from '../indexing/indexer'; +import { redactText } from './redactText'; +import { mapStemToOriginalText } from './mapStemToOriginalText'; +import { WordPunctStemTokenizer } from '../tokenizers'; -import { tokenizeText, stemTokens, mapStemmedEmitsToOriginal } from '../indexing/utils'; +const tokenizer = new WordPunctStemTokenizer(); -type SearchResult = { +export type SearchResult = { start: number; end: number; - keyword: string; + indexKeyword: string; + originalKeyword: string; }; -const isEqual = (a: Emit, b: Emit) => { - return a.start === b.start && a.keyword === b.keyword; +const isEqual = (a: SearchResult, b: SearchResult) => { + return a.start === b.start && a.indexKeyword === b.indexKeyword; }; export default class Search { @@ -38,40 +41,20 @@ export default class Search { public find(text: string): SearchResult[] { const redactedText = redactText(text); // Redact text that we don't want to be searched - console.log('redactedText'); - console.log(redactedText); - // Stem the text - const stemmedTokens = stemTokens(tokenizeText(redactedText)); - const stemmedText = stemmedTokens.map((t) => t.stem).join(''); - - console.log('stemmedText'); - console.log(stemmedText); + const tokens = tokenizer.tokenize(redactedText); + const stemmedText = tokens.map((t) => t.stem).join(''); // Search stemmed text - const stemmedResults = this.trie.parseText(stemmedText); - - console.log('stemmedResults'); - console.log(stemmedResults); + const emits = this.trie.parseText(stemmedText); // Map stemmed results to original text - const originalResults = mapStemmedEmitsToOriginal(stemmedTokens, stemmedResults); - - console.log('originalResults'); - console.log(originalResults); - - return this.mapToSearchResults(originalResults); - } - - private mapToSearchResults(results: Emit[]): SearchResult[] { - return _.uniqWith(results, isEqual) - .filter((result) => this.keywordExistsInIndex(result.keyword)) - .map((result) => ({ - start: result.start, - end: result.end + 1, - keyword: result.keyword, - })) - .sort((a, b) => a.start - b.start); // Must sort by start position to prepare for highlighting + return _.chain(emits) + .map((emit) => mapStemToOriginalText(emit, tokens)) + .uniqWith(isEqual) + .filter((result) => this.keywordExistsInIndex(result.indexKeyword)) + .sort((a, b) => a.start - b.start) // Must sort by start position to prepare for highlighting + .value(); } private keywordExistsInIndex(index: string): boolean { diff --git a/src/search/mapStemToOriginalText.ts b/src/search/mapStemToOriginalText.ts new file mode 100644 index 0000000..ab323b0 --- /dev/null +++ b/src/search/mapStemToOriginalText.ts @@ -0,0 +1,28 @@ +import { Emit } from '@tanishiking/aho-corasick'; + +import { SearchResult } from '../search/index'; +import { Token } from '../tokenizers'; + +/** + * Takes a given search result (which has the start/end position and a "stemmed" keyword) + * that was matched, and maps them to a new start/end position for the original keyword + * which was stem was created from + * @param searchResult + * @param tokens + * @returns + */ +export const mapStemToOriginalText = (searchResult: Emit, tokens: Token[]): SearchResult => { + const matchingTokens = tokens.filter( + (token) => token.stemStart >= searchResult.start && token.stemEnd <= searchResult.end + 1 + ); + + return { + start: 
matchingTokens[0].originalStart, + end: matchingTokens[matchingTokens.length - 1].originalEnd, + indexKeyword: matchingTokens + .map((token) => token.stem) + .join('') + .toLowerCase(), + originalKeyword: matchingTokens.map((token) => token.originalText).join(''), + }; +}; diff --git a/src/search/search.utils.spec.ts b/src/search/redactText.spec.ts similarity index 97% rename from src/search/search.utils.spec.ts rename to src/search/redactText.spec.ts index 3639ed3..884e458 100644 --- a/src/search/search.utils.spec.ts +++ b/src/search/redactText.spec.ts @@ -1,4 +1,4 @@ -import { redactText } from './search.utils'; +import { redactText } from './redactText'; describe('redactText', () => { it('Hashtags are redacted', () => { diff --git a/src/search/search.utils.ts b/src/search/redactText.ts similarity index 100% rename from src/search/search.utils.ts rename to src/search/redactText.ts diff --git a/src/search/search.spec.ts b/src/search/search.spec.ts new file mode 100644 index 0000000..d993249 --- /dev/null +++ b/src/search/search.spec.ts @@ -0,0 +1,81 @@ +import { Indexer } from '../indexing/indexer'; +import Search from './index'; + +const getKeywordsMockFn = jest.fn(); + +jest.mock('../indexing/indexer', () => { + return { + Indexer: jest.fn().mockImplementation(() => { + return { + getKeywords: getKeywordsMockFn, + getDocumentsByKeyword: () => [{}], + }; + }), + }; +}); + +beforeEach(() => { + jest.clearAllMocks(); +}); + +describe('Search class', () => { + it('Highlights single keywords that can be stemmed', () => { + getKeywordsMockFn.mockReturnValue(['search', 'note']); + const text = 'This is a note that I will be use for searching'; + + const indexer = new Indexer(null); + const search = new Search(indexer); + const results = search.find(text); + + expect(results).toEqual([ + { + start: 10, + end: 14, + indexKeyword: 'note', + originalKeyword: 'note', + }, + { + start: 38, + end: 47, + indexKeyword: 'search', + originalKeyword: 'searching', + }, + ]); + }); + + it('Longer keyword matches are always prioritised for highlight', () => { + getKeywordsMockFn.mockReturnValue(['github', 'github fork']); + const text = 'I use GitHub Forks as part of my development flow'; + + const indexer = new Indexer(null); + const search = new Search(indexer); + const results = search.find(text); + + expect(results).toEqual([ + { + start: 6, + end: 18, + indexKeyword: 'github fork', + originalKeyword: 'GitHub Forks', + }, + ]); + }); + + it('Three word keyword is highlighted', () => { + getKeywordsMockFn.mockReturnValue(['shared', 'client', 'record', 'share client record']); + const text = 'Designing a shared client record is a great idea but challenging'; + + const indexer = new Indexer(null); + const search = new Search(indexer); + const results = search.find(text); + + expect(results).toEqual([ + { + start: 12, + end: 32, + indexKeyword: 'share client record', + originalKeyword: 'shared client record', + }, + ]); + }); +}); diff --git a/src/stemmers/index.ts b/src/stemmers/index.ts new file mode 100644 index 0000000..8c3bc9a --- /dev/null +++ b/src/stemmers/index.ts @@ -0,0 +1,21 @@ +import { PorterStemmer } from 'natural'; + +import { WordPunctStemTokenizer } from '../tokenizers'; + +/** + * Stem a given phrase. 
If the phrase is made up of multiple words, + * the last word in the phrase is the only one that will be stemmed + * @param text input text + * @returns stemmed text + */ +export const stemLastWord = (text: string): string => { + return PorterStemmer.stem(text); +}; + +export const stemPhrase = (text: string): string => { + const tokenizer = new WordPunctStemTokenizer(); + return tokenizer + .tokenize(text) + .map((t) => t.stem) + .join(''); +}; diff --git a/src/tokenizers/index.ts b/src/tokenizers/index.ts new file mode 100644 index 0000000..8291720 --- /dev/null +++ b/src/tokenizers/index.ts @@ -0,0 +1,86 @@ +import _ from 'lodash'; +import { PorterStemmer, NGrams } from 'natural'; +import { Trie } from '@tanishiking/aho-corasick'; +import * as natural from 'natural'; + +import { stemLastWord } from '../stemmers'; + +export type Token = { + index: number; + originalText: string; + originalStart: number; + originalEnd: number; + stem: string; + stemStart: number; + stemEnd: number; +}; + +export class WordPermutationsTokenizer { + private trie: Trie; + + constructor() { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const stopWords: string[] = (natural as any).stopwords; + + this.trie = new Trie(stopWords, { + allowOverlaps: false, + onlyWholeWords: true, + caseInsensitive: true, + }); + } + + public tokenize(text: string): string[] { + const tokens = PorterStemmer.tokenizeAndStem(text); // Strip punctuation and stop words, stem remaining words + + if (tokens.length >= 5) { + return [...tokens, ...NGrams.bigrams(tokens).map((tokens) => tokens.join(' '))]; + } + + return this.combinations(tokens, 2, 2); + } + + private combinations(arr: string[], min: number, max: number) { + return [...Array(max).keys()] + .reduce((result) => { + return arr.concat( + result.flatMap((val) => + arr.filter((char) => char !== val).map((char) => `${val} ${char}`) + ) + ); + }, []) + .filter((val) => val.length >= min); + } +} + +export class WordPunctStemTokenizer { + private pattern = /([\s]+|[A-zÀ-ÿ-]+|[0-9._]+|.|!|\?|'|"|:|;|,|-)/i; + + public tokenize(text: string): Token[] { + const tokens = text.split(this.pattern); + return _.chain(tokens).without('').transform(this.stringToTokenAccumulator()).value(); + } + + private stringToTokenAccumulator() { + let originalCharIndex = 0; + let stemCharIndex = 0; + + return (acc: Token[], token: string, index: number) => { + const stemmedToken = stemLastWord(token); + + acc.push({ + index, + originalText: token, + originalStart: originalCharIndex, + originalEnd: originalCharIndex + token.length, + stem: stemmedToken, + stemStart: stemCharIndex, + stemEnd: stemCharIndex + stemmedToken.length, + }); + + originalCharIndex += token.length; + stemCharIndex += stemmedToken.length; + + return acc; + }; + } +} diff --git a/src/tokenizers/tokenizer.spec.ts b/src/tokenizers/tokenizer.spec.ts new file mode 100644 index 0000000..64d1fb4 --- /dev/null +++ b/src/tokenizers/tokenizer.spec.ts @@ -0,0 +1,93 @@ +import { WordPermutationsTokenizer, WordPunctStemTokenizer } from '.'; + +describe('WordPermutationsTokenizer', () => { + const dataSet = [ + { + description: 'Single word', + sentence: 'John', + expected: ['john'], + }, + { + description: 'Two words with no stop words', + sentence: 'John Doe', + expected: ['john', 'doe', 'john doe', 'doe john'], + }, + { + description: 'Two words (with one stop word at the start)', + sentence: 'The brothers Karamazov', + expected: ['brother', 'karamazov', 'brother karamazov', 'karamazov brother'], + }, + { + description: 
'Two words (with stop words throughout the sentence)', + sentence: 'An Officer and a Spy', + expected: ['offic', 'spy', 'offic spy', 'spy offic'], + }, + { + description: 'Three words with no stop words', + sentence: 'GitHub Forking tutorial', + expected: [ + 'github', + 'fork', + 'tutori', + 'github fork', + 'github tutori', + 'fork github', + 'fork tutori', + 'tutori github', + 'tutori fork', + ], + }, + + { + description: 'Five words or more does not generate permutations', + sentence: 'Ten Arguments For Deleting Your Social Media Accounts Right Now', + expected: [ + 'ten', + 'argument', + 'delet', + 'social', + 'media', + 'account', + 'right', + 'ten argument', + 'argument delet', + 'delet social', + 'social media', + 'media account', + 'account right', + ], + }, + ]; + + dataSet.forEach(({ description, sentence, expected }) => { + it(`Tokenize phase permutations (${description})`, () => { + const tokenizer = new WordPermutationsTokenizer(); + const tokens = tokenizer.tokenize(sentence); + + expect(tokens).toEqual(expected); + }); + }); +}); + +describe('WordPunctStemTokenizer', () => { + it('Tokenize and stem a simple phrase', () => { + const sentence = 'The lazy dog jumped over the fence.'; + + const tokenizer = new WordPunctStemTokenizer(); + const tokens = tokenizer.tokenize(sentence); + + expect(tokens.length).toEqual(14); + + expect(tokens[2]).toEqual({ + index: 2, + originalText: 'lazy', + originalStart: 4, + originalEnd: 8, + stem: 'lazi', + stemStart: 4, + stemEnd: 8, + }); + + expect(tokens[6].stem).toEqual('jump'); + }); +}); From 5b19203263cbf86a4aeb95f6f4ea91350f606a97 Mon Sep 17 00:00:00 2001 From: Hady Osman Date: Sun, 27 Feb 2022 01:06:59 +1300 Subject: [PATCH 3/3] Remove @liquicode/lib-tokenize dependency --- package.json | 1 - 1 file changed, 1 deletion(-) diff --git a/package.json b/package.json index 6eaa209..a1826e2 100644 --- a/package.json +++ b/package.json @@ -69,7 +69,6 @@ "*.{js,css,md}": "prettier --write" }, "dependencies": { - "@liquicode/lib-tokenize": "^0.1.4", "@tanishiking/aho-corasick": "^0.0.1", "@types/natural": "^5.1.0", "lodash": "^4.17.21",
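
The series above makes the search run over stemmed text and then translate each hit back into coordinates of the original note (see mapStemToOriginalText.ts and WordPunctStemTokenizer introduced in PATCH 2/3). The following is a minimal, self-contained TypeScript sketch of that offset bookkeeping only; naiveStem, sketchTokenize, and mapHitToOriginal are illustrative stand-ins, not the plugin's actual PorterStemmer and aho-corasick based implementation.

// Sketch: the search happens over the concatenation of stems, so every token
// records two spans: its place in the original text and its place in the
// stemmed text. Hits found in stem coordinates are mapped back through them.

type SketchToken = {
  originalText: string;
  originalStart: number;
  originalEnd: number;
  stem: string;
  stemStart: number;
  stemEnd: number;
};

// Hypothetical stemmer stand-in (the plugin itself uses natural's PorterStemmer).
const naiveStem = (word: string): string => word.toLowerCase().replace(/(ing|ed|s)$/, '');

// Split into word and whitespace tokens so that offsets stay contiguous.
const sketchTokenize = (text: string): SketchToken[] => {
  const tokens: SketchToken[] = [];
  let originalAt = 0;
  let stemAt = 0;
  for (const part of text.split(/(\s+)/).filter((p) => p !== '')) {
    const stem = /^\s+$/.test(part) ? part : naiveStem(part);
    tokens.push({
      originalText: part,
      originalStart: originalAt,
      originalEnd: originalAt + part.length,
      stem,
      stemStart: stemAt,
      stemEnd: stemAt + stem.length,
    });
    originalAt += part.length;
    stemAt += stem.length;
  }
  return tokens;
};

// Given a hit over the stemmed text (start inclusive, end exclusive),
// return the covered span of the original text.
const mapHitToOriginal = (tokens: SketchToken[], start: number, end: number) => {
  const covered = tokens.filter((t) => t.stemStart >= start && t.stemEnd <= end);
  return {
    start: covered[0].originalStart,
    end: covered[covered.length - 1].originalEnd,
    originalKeyword: covered.map((t) => t.originalText).join(''),
  };
};

// Usage: "searching" stems to "search"; a hit on the stemmed text maps back to
// characters 38..47 of the original sentence, matching the expectation in search.spec.ts.
const tokens = sketchTokenize('This is a note that I will be use for searching');
const stemmedText = tokens.map((t) => t.stem).join('');
const hitStart = stemmedText.indexOf('search');
console.log(mapHitToOriginal(tokens, hitStart, hitStart + 'search'.length));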