From dd46852a4dd802da22a393614d3b0763da1a2904 Mon Sep 17 00:00:00 2001 From: Cheng Qian Date: Fri, 19 Jan 2024 11:01:02 -0500 Subject: [PATCH] feat: add zh-cn, zh-tw support --- README.md | 2 +- .../resources/zh-cn/stopwords | 155 ++++++++++++++++++ .../resources/zh-tw/stopwords | 51 ++++++ assistant_skill_analysis/utils/lang_utils.py | 18 +- classic_dialog_skill_analysis.ipynb | 2 +- classic_dialog_skill_analysis_cp4d.ipynb | 2 +- new_experience_skill_analysis.ipynb | 2 +- new_experience_skill_analysis_cp4d.ipynb | 2 +- requirements.txt | 3 +- tests/utils/test_lang_utils.py | 14 ++ 10 files changed, 244 insertions(+), 7 deletions(-) create mode 100644 assistant_skill_analysis/resources/zh-cn/stopwords create mode 100644 assistant_skill_analysis/resources/zh-tw/stopwords diff --git a/README.md b/README.md index 1f54e52..19fcd35 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Whether you are new to the process and are building your first AI assistant or y - Why is the assistant responding incorrectly to this question? - How do I improve my assistant’s ability to understand questions? -Currently Supported Languages: en, fr, cs, de, es, it, pt, nl +Currently Supported Languages: en, fr, cs, de, es, it, pt, nl, zh-cn, zh-tw ## Usage If you clone the notebook from this repository locally, please use the steps below. For usage in Watson studio, please refer to the diff --git a/assistant_skill_analysis/resources/zh-cn/stopwords b/assistant_skill_analysis/resources/zh-cn/stopwords new file mode 100644 index 0000000..1848efb --- /dev/null +++ b/assistant_skill_analysis/resources/zh-cn/stopwords @@ -0,0 +1,155 @@ +< +> +| +- +, +; +: +! +? +. +'' +' +" +( +) +[ +] +{ +} +* +% ++ +。 + +一 +一会儿 +一边 +一面 +上 +下 +不 +不但 +不光 +不可 +不如 +不是 +不管 +不论 +与 +与其 +个 +中 +为 +之 +之所以 +也 +也不 +也许 +也许是 +了 +于 +从 +他 +他们 +以 +会 +但 +你们 +便 +倘若 +先 +全 +其 +再 +到 +前 +十 +即使 +却 +又 +及 +只 +只有 +只要 +可 +可以 +可是 +可能 +各 +后 +向 +和 +哪怕 +因为 +因此 +在 +地 +多 +她 +她们 +如果 +宁可 +它 +它们 +对 +将 +小 +就 +尽管 +已 +已经 +并 +并且 +很 +我 +我们 +或 +所 +所以 +才 +把 +据 +无论 +既 +既然 +时 +是 +是因为 +更 +最 +有 +未 +来 +此 +每 +没有 +然后 +然而 +用 +由 +由于 +的 +看 +着 +种 +而 +而且 +而是 +能 +自己 +至 +虽然 +被 +要 +认为 +让 +该 +还 +还是 +这 +通过 +那么 +都 +非 +、 diff --git a/assistant_skill_analysis/resources/zh-tw/stopwords b/assistant_skill_analysis/resources/zh-tw/stopwords new file mode 100644 index 0000000..1cf8259 --- /dev/null +++ b/assistant_skill_analysis/resources/zh-tw/stopwords @@ -0,0 +1,51 @@ +the +of +is +and +to +in +that +we +for +an +are +by +be +as +on +with +can +if +from +which +you +it +this +then +at +have +all +not +one +has +or +that +的 +了 +和 +是 +就 +都 +而 +及 +與 +著 +或 +一個 +沒有 +我們 +你們 +妳們 +他們 +她們 +是否 \ No newline at end of file diff --git a/assistant_skill_analysis/utils/lang_utils.py b/assistant_skill_analysis/utils/lang_utils.py index 6483136..2b689d5 100644 --- a/assistant_skill_analysis/utils/lang_utils.py +++ b/assistant_skill_analysis/utils/lang_utils.py @@ -1,13 +1,15 @@ import os import re +from types import SimpleNamespace import sys +import jieba from nltk.stem.snowball import SnowballStemmer from spacy.tokenizer import Tokenizer import unicodedata import assistant_skill_analysis -SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl"] +SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl", "zh-cn", "zh-tw"] PUNCTUATION = [ "\\" + chr(i) for i in range(sys.maxunicode) @@ -15,6 +17,15 @@ ] +class _JiebaTokenizerWrapper: + """for zh-cn and zh-tw""" + + def __call__(self, *args, **kwargs): + text = args[0] + for token in jieba.tokenize(text): + yield 
SimpleNamespace(text=token[0])
+
+
 class LanguageUtility:
     def __init__(self, language_code):
         if language_code not in SUPPORTED_LANGUAGE:
@@ -96,6 +107,11 @@ def init_resources(self):
             self.tokenizer = Tokenizer(Dutch().vocab)
             self.stemmer = SnowballStemmer(language="dutch")
             self.stop_words = self.load_stop_words(stopwords_path)
+
+        elif self.language_code in ["zh-cn", "zh-tw"]:
+            self.tokenizer = _JiebaTokenizerWrapper()
+            self.stop_words = self.load_stop_words(stopwords_path)
+
         else:
             raise Exception("language code %s is not supported", self.language_code)
 
diff --git a/classic_dialog_skill_analysis.ipynb b/classic_dialog_skill_analysis.ipynb
index 674f4db..bda8ed1 100644
--- a/classic_dialog_skill_analysis.ipynb
+++ b/classic_dialog_skill_analysis.ipynb
@@ -73,7 +73,7 @@
 "metadata": {},
 "source": [
 "Pick the language code correspond to your workspace data: \n",
-    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
+    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
 ]
 },
 {
diff --git a/classic_dialog_skill_analysis_cp4d.ipynb b/classic_dialog_skill_analysis_cp4d.ipynb
index 7242d04..412cbc3 100644
--- a/classic_dialog_skill_analysis_cp4d.ipynb
+++ b/classic_dialog_skill_analysis_cp4d.ipynb
@@ -73,7 +73,7 @@
 "metadata": {},
 "source": [
 "Pick the language code correspond to your workspace data: \n",
-    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
+    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
 ]
 },
 {
diff --git a/new_experience_skill_analysis.ipynb b/new_experience_skill_analysis.ipynb
index 3874755..904dbae 100644
--- a/new_experience_skill_analysis.ipynb
+++ b/new_experience_skill_analysis.ipynb
@@ -80,7 +80,7 @@
 "### Assistant Settings\n",
 "Please set values for the variables in the cell below to configure this notebook.\n",
 "\n",
-    "- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
+    "- **LANGUAGE_CODE:** language code corresponding to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
 "\n",
 "- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
 "\n",
diff --git a/new_experience_skill_analysis_cp4d.ipynb b/new_experience_skill_analysis_cp4d.ipynb
index 757f4a7..3a58e48 100644
--- a/new_experience_skill_analysis_cp4d.ipynb
+++ b/new_experience_skill_analysis_cp4d.ipynb
@@ -80,7 +80,7 @@
 "### Assistant Settings\n",
 "Please set values for the variables in the cell below to configure this notebook. The notebook uses CloudPakForDataAuthenticator to authenticate the APIs.\n",
 "\n",
-    "- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
+    "- **LANGUAGE_CODE:** language code corresponding to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
 "\n",
 "- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
 "\n",
diff --git a/requirements.txt b/requirements.txt
index 960c872..2e4a227 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ scipy>=1.2.0
 jupyter
 spacy~=2.3.2
 ibm-cos-sdk>=2.11.0
-nbconvert>=7.7.1
\ No newline at end of file
+nbconvert>=7.7.1
+jieba
\ No newline at end of file
diff --git a/tests/utils/test_lang_utils.py b/tests/utils/test_lang_utils.py
index a32edbf..d07aa9e 100644
--- a/tests/utils/test_lang_utils.py
+++ b/tests/utils/test_lang_utils.py
@@ -61,6 +61,20 @@ def test_de(self):
         sent = util.tokenize(sent)
         self.assertEqual(sent, ["autobahn"])
 
+    def test_zh_cn(self):
+        util = LanguageUtility("zh-cn")
+        sent = util.preprocess("不想当兼职")
+        self.assertEqual(sent, "不想当兼职")
+        sent = util.tokenize(sent)
+        self.assertEqual(sent, ['不想', '当', '兼职'])
+
+    def test_zh_tw(self):
+        util = LanguageUtility("zh-tw")
+        sent = util.preprocess("畀到機會我嘗試")
+        self.assertEqual(sent, "畀到機會我嘗試")
+        sent = util.tokenize(sent)
+        self.assertEqual(sent, ['畀', '到', '機會', '我', '嘗試'])
+
     def tearDown(self):
         unittest.TestCase.tearDown(self)
         self.skill_file.close()
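
Usage sketch (illustrative, based on the tests added in this patch): the snippet below shows how the new zh-cn / zh-tw path is expected to behave once jieba is installed from the updated requirements.txt and the assistant_skill_analysis package is importable. The expected token lists are copied from test_zh_cn and test_zh_tw above, not re-verified independently.

    # Minimal sketch of the jieba-backed Chinese support added by this patch.
    from assistant_skill_analysis.utils.lang_utils import LanguageUtility

    zh_cn = LanguageUtility("zh-cn")
    sent = zh_cn.preprocess("不想当兼职")      # per the new tests, preprocess leaves this input unchanged
    print(zh_cn.tokenize(sent))                # expected (from test_zh_cn): ['不想', '当', '兼职']

    zh_tw = LanguageUtility("zh-tw")
    sent = zh_tw.preprocess("畀到機會我嘗試")  # likewise unchanged per test_zh_tw
    print(zh_tw.tokenize(sent))                # expected (from test_zh_tw): ['畀', '到', '機會', '我', '嘗試']

Design note: _JiebaTokenizerWrapper yields SimpleNamespace(text=word) objects for each word returned by jieba.tokenize, presumably so that jieba tokens expose the same .text attribute as the spaCy Tokenizer output used for the other languages, leaving the downstream tokenization and stop-word filtering in LanguageUtility unchanged.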