Skip to content
This repository has been archived by the owner on Mar 10, 2023. It is now read-only.

Use hunspell library. #13

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 38 additions & 34 deletions pospell.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import subprocess
import sys
import tempfile
from unicodedata import category
from contextlib import redirect_stderr
from itertools import chain
from pathlib import Path
Expand All @@ -16,21 +17,14 @@
import polib
from docutils.parsers.rst import roles
from docutils.utils import new_document

from hunspell import Hunspell
from nltk.tokenize import TweetTokenizer
import regex

__version__ = "1.0.3"

DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}

try:
HUNSPELL_VERSION = subprocess.check_output(
["hunspell", "--version"], universal_newlines=True
).split("\n")[0]
except FileNotFoundError:
print("hunspell not found, please install hunspell.", file=sys.stderr)
exit(1)


class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Docutils node combining ``Inline`` and ``TextElement``, adding nothing.

    NOTE(review): presumably a placeholder node for roles that need no
    real rendering -- confirm against its users elsewhere in the file.
    """
Expand Down Expand Up @@ -130,6 +124,8 @@ def clear(po_path, line, drop_capitalized=False):
line = regex.sub(r"\s+", " ", line)
to_drop = {
r'<a href="[^"]*?">',
r"</a>",
r"\w*@\w*", # Emails and various handles (docs@, @sizeof, ...)
# Strip acronyms
r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
r"---?", # -- and --- separators to be ignored
Expand Down Expand Up @@ -241,6 +237,15 @@ def parse_args():
return args


def should_ignore(word):
    """Tell whether *word* is a token that must not be spell checked.

    A token is ignored when it consists only of punctuation and separator
    characters (Unicode major categories ``P`` and ``Z``; the empty string
    qualifies too), or when at least one of its characters is numeric
    (major category ``N``).

    Returns True when the token should be skipped, False otherwise.
    """
    # Only the first letter of a Unicode category (its major class) matters.
    major_categories = {category(char)[0] for char in word}
    if major_categories <= {"P", "Z"}:
        return True
    return "N" in major_categories


def spell_check(
    po_files, personal_dict, language, drop_capitalized=False, debug_only=False
):
    """Check the given po files for spelling mistakes.

    po_files: iterable of paths to the po files to check.
    personal_dict: path to a file listing one accepted word per line,
        or a false value when there is no personal dictionary.
    language: name of the hunspell dictionary to use.
    drop_capitalized: forwarded to po_to_text.
    debug_only: when true, only print the text that would be checked,
        without running any spell checking.

    Each misspelled word is printed as ``file:line: 'word'``, followed by
    hunspell's suggestions when there are any.

    Returns the number of misspelled words found (0 in debug_only mode).
    """
    if debug_only:
        # No dictionary is needed just to show the extracted text.
        for po_file in po_files:
            print(po_to_text(str(po_file), drop_capitalized))
        return 0

    # NOTE(review): the dictionary directory is hard-coded; this is where
    # most Linux distributions install hunspell dictionaries, but it will
    # not be found elsewhere -- consider making it configurable.
    hunspell = Hunspell(language, hunspell_data_dir="/usr/share/hunspell")

    # The personal dictionary is optional: only read it when one is given.
    whitelist = set()
    if personal_dict:
        with open(personal_dict) as personal_dict_file:
            stripped_lines = (line.strip() for line in personal_dict_file)
            whitelist = {entry for entry in stripped_lines if entry}

    tknzr = TweetTokenizer()
    errors = 0
    for po_file in po_files:
        text_to_check = po_to_text(str(po_file), drop_capitalized)
        # Lines are counted from 1, as editors (and the hunspell command
        # line output this function used to relay) do.
        for line_no, line in enumerate(text_to_check.split("\n"), start=1):
            # Typographic apostrophes are normalised to plain ones.
            line = line.replace("’", "'")
            for word in tknzr.tokenize(line):
                if len(word) == 1:
                    continue
                # Accept the word as written as well as lowercased, so
                # capitalised personal dictionary entries match too.
                if word in whitelist or word.lower() in whitelist:
                    continue
                # Cheap category test first, dictionary lookup second.
                if should_ignore(word) or hunspell.spell(word):
                    continue
                errors += 1
                suggestions = hunspell.suggest(word)
                if suggestions:
                    print(
                        f"{po_file}:{line_no}: {word!r}, "
                        f"suggestions: {', '.join(suggestions)}"
                    )
                else:
                    print(f"{po_file}:{line_no}: {word!r}")
    return errors


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
extras_require={
"dev": ["bandit", "black", "detox", "flake8", "isort", "mypy", "pylint"]
},
install_requires=["polib", "docutils>=0.11", "regex"],
install_requires=["polib", "docutils>=0.11", "regex", "cyhunspell", "nltk"],
license="MIT license",
keywords="po spell gettext reStructuredText check sphinx translation",
classifiers=[
Expand Down