Skip to content

Commit

Permalink
Fix token was not concatenated correctly.
Browse files Browse the repository at this point in the history
Add standalone cmake Python module build(for developer)
Remove leading '\t' in feature string.
  • Loading branch information
syoyo committed Jan 20, 2024
1 parent 01ca3f0 commit 57e54fb
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 34 deletions.
51 changes: 46 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
cmake_minimum_required(VERSION 3.16)

# Target names used throughout this file: the command-line executable and the
# Python extension module.
set(EXE_TARGET "jagger")
set(PY_TARGET "jagger_ext")

# The project is named after the executable target; only C++ is enabled.
project(${EXE_TARGET} LANGUAGES CXX)

# ---- Build options ----------------------------------------------------------

# Switch for building the Python module (developer builds).
option(JAGGER_WITH_PYTHON
  "Build Python module(For developer)."
  On)

# Switch for preferring a locally-installed Python interpreter.
option(JAGGER_PREFER_LOCAL_PYTHON_INSTALLATION
  "Prefer locally-installed Python interpreter than system or conda/brew installed Python. Please specify your Python interpreter with `Python3_EXECUTABLE` cmake option if you enable this option."
  OFF)

# ---- CMake module search path -----------------------------------------------

# Make the helper modules bundled under cmake/ (including the sanitizer
# modules) visible to include() and find_package().
list(APPEND CMAKE_MODULE_PATH
  ${PROJECT_SOURCE_DIR}/cmake
  ${PROJECT_SOURCE_DIR}/cmake/sanitizers)
Expand All @@ -12,15 +20,48 @@ set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Build standalone .so (for developer).
#
# When JAGGER_WITH_PYTHON is enabled, this locates Python3 and pybind11,
# builds the ${PY_TARGET} extension module from the pybind11 binding source,
# and copies the resulting module file into jagger/ after each build.
if(JAGGER_WITH_PYTHON)

  if(JAGGER_PREFER_LOCAL_PYTHON_INSTALLATION)
    set(Python3_FIND_FRAMEWORK NEVER) # Do not search framework python
    set(Python3_FIND_STRATEGY LOCATION)
    set(Python3_FIND_REGISTRY NEVER) # Windows only
  else()
    set(Python3_FIND_FRAMEWORK LAST) # Prefer Brew/Conda to Apple framework python
  endif()

  find_package(
    Python3
    COMPONENTS Interpreter Development
    REQUIRED)

  # REQUIRED: pybind11_add_module() below is provided by the pybind11 package.
  # Without REQUIRED a missing pybind11 would only surface later as an
  # "Unknown CMake command" error instead of a clear "package not found".
  find_package(pybind11 CONFIG REQUIRED)

  # pybind11 method:
  pybind11_add_module(${PY_TARGET} jagger/python-binding-jagger.cc)

  # Copy the built module to jagger/ after the build.
  # PROJECT_SOURCE_DIR (not CMAKE_SOURCE_DIR) keeps the destination inside this
  # project even when it is included from another project via
  # add_subdirectory().
  add_custom_command(
    TARGET ${PY_TARGET}
    POST_BUILD
    COMMAND "${CMAKE_COMMAND}" -E copy "$<TARGET_FILE:${PY_TARGET}>"
            "${PROJECT_SOURCE_DIR}/jagger/$<TARGET_FILE_NAME:${PY_TARGET}>"
    COMMENT "copying jagger python module file to jagger/"
    VERBATIM)

endif()


add_executable(${EXE_TARGET} cpp_cli/jagger-app.cc)
add_sanitizers(${EXE_TARGET})

target_include_directories(${EXE_TARGET} PRIVATE jagger)

#target_compile_definitions(${EXE_TARGET} PRIVATE "JAGGER_DEFAULT_MODEL=\"/usr/local/lib/jagger/model/kwdlc\"")

#
#target_compile_definitions(${EXE_TARGET} PRIVATE "NUM_POS_FIELD=4")
#target_compile_definitions(${EXE_TARGET} PRIVATE "USE_JUMANDIC=1")
# enable mmap by default.
target_compile_definitions(${EXE_TARGET} PRIVATE "JAGGER_USE_MMAP_IO")

# [VisualStudio]
if(WIN32)
Expand Down
14 changes: 14 additions & 0 deletions bootstrap-cpp-python.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/sh
# Configure a standalone CMake build tree for the jagger Python module
# (developer use). Recreates ./build_python_module from scratch and runs the
# CMake configure step with clang/clang++ and JAGGER_WITH_PYTHON enabled.

# Abort on the first failing command or on use of an unset variable, so that a
# failed pybind11 lookup does not silently configure with a bogus path.
set -eu

curdir=$(pwd)

# All expansions are quoted: the working directory may contain whitespace,
# which is especially dangerous for the `rm -rf` below.
builddir="${curdir}/build_python_module"

rm -rf "${builddir}"
mkdir "${builddir}"

# set path to pybind11
# If you install pybind11 through pip, it's usually installed to <site-package path>/pybind11.
pybind11_path=$(python -c "import site; print (site.getsitepackages()[0])")

CC=clang CXX=clang++ pybind11_DIR="${pybind11_path}/pybind11" cmake -B"${builddir}" -S. \
  -DJAGGER_WITH_PYTHON=1 \
  -DCMAKE_VERBOSE_MAKEFILE=1
8 changes: 2 additions & 6 deletions example/batch_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@

for toks in toks_list:
for tok in toks:
print(tok.surface(), tok.feature())
print(tok)

# NOTE: surface() string contains trailing whitespaces.
# Use split() or rsplit() to strip whitespaces if you dont want it.
# print("surface", tok.surface().rsplit()[0])

print("EOL")
print("EOS")

12 changes: 6 additions & 6 deletions example/simple_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
toks = tokenizer.tokenize(text)

for tok in toks:
print(tok.surface(), tok.feature())
# print surface + TAB + feature
print(tok)

# NOTE: surface() string contains trailing whitespaces.
# Use split() or rsplit() to strip whitespaces if you dont want it.
# print("surface", tok.surface().rsplit()[0])
# or you can print surface and feature independently.
#print(tok.surface(), tok.feature())

print("EOL")
print("EOS")


for tok in toks:
Expand All @@ -33,4 +33,4 @@
for i in range(tok.n_tags()):
print(" tag[{}] = {}".format(i, tok.tag(i)))

print("EOL")
print("EOS")
7 changes: 6 additions & 1 deletion jagger/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#from jagger_ext import *
from jagger_ext import *

# load setptools_scm generated _version.py
try:
Expand All @@ -24,6 +24,11 @@ def tokenize(self, s: str):
return self._tagger.tokenize(s)

def tokenize_batch(self, s: str):
    """Tokenize several lines of text in one call.

    Args:
        s: Either a single string, which is forwarded to the underlying
            tagger unchanged, or a list/tuple of strings, which are joined
            with '\\n' before being forwarded.

    Returns:
        Whatever ``self._tagger.tokenize_batch`` returns for the text.
    """
    if isinstance(s, (list, tuple)):
        s = '\n'.join(s)
        # Strip the redundant '\n' produced when the items themselves end
        # with '\n'. str.replace() returns a new string (str is immutable),
        # so the result must be assigned back; it was previously discarded.
        s = s.replace('\n\n', '\n')

    return self._tagger.tokenize_batch(s)

def set_threads(self, n: int):
Expand Down
16 changes: 15 additions & 1 deletion jagger/jagger.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,14 @@
#include <windows.h>
#endif

#if !defined(_WIN32)
#include <unistd.h>
#endif

#if defined(JAGGER_USE_MMAP_IO)
#if !defined(_WIN32)
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
//#include <err.h>
#endif
#endif
Expand All @@ -42,6 +45,17 @@
#include "config.h"
#endif

// Fallback value for JAGGER_DEFAULT_MODEL, used only when the macro has not
// already been defined before this point (e.g. on the compiler command line).
#ifndef JAGGER_DEFAULT_MODEL
#define JAGGER_DEFAULT_MODEL "model/kwdlc"
#endif

// Fallback value for NUM_POS_FIELD, used only when the macro has not already
// been defined before this point.
#ifndef NUM_POS_FIELD
// mecab style
#define NUM_POS_FIELD 4
#endif



static void my_errx(int retcode, const char *fmt, const char *s)
{
fprintf(stderr, "jagger: ");
Expand Down
36 changes: 24 additions & 12 deletions jagger/python-binding-jagger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,10 @@ class PyToken {
return std::string();
}

const std::string str() {
return _surface + "\t" + _feature;
}

private:
std::string _surface;

Expand Down Expand Up @@ -1096,18 +1100,18 @@ class tagger {
(offsets >> MAX_KEY_BITS) & 0x7f);
write_string(_ptr, ",*,*,*\n", 7);

toks.back().get_feature() = std::string(&fs[(offsets >> 34)],
(offsets >> MAX_KEY_BITS) & 0x7f);
toks.back().get_feature() = ltrim(std::string(&fs[(offsets >> 34)],
(offsets >> MAX_KEY_BITS) & 0x7f));

toks.back().get_feature() += ",*,*,*";
} else {
write_string(
_ptr, &fs[(offsets >> 34)],
(offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff);

// strip '\n'
toks.back().get_feature() = rtrim(std::string(&fs[(offsets >> 34)],
(offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff));
// feature contains leading '\t' and ending '\n'. we remove it.
toks.back().get_feature() = ltrim(rtrim(std::string(&fs[(offsets >> 34)],
(offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff)));
}
concat = false;
} else {
Expand All @@ -1122,9 +1126,15 @@ class tagger {

// surface
write_string(_ptr, p, static_cast<size_t>(bytes));
PyToken tok;
tok.get_surface() = std::string(p, static_cast<size_t>(bytes));
toks.push_back(tok);

if (concat) {
// concat word to the surface of last token
toks.back().get_surface() += std::string(p, static_cast<size_t>(bytes));
} else {
PyToken tok;
tok.get_surface() = std::string(p, static_cast<size_t>(bytes));
toks.push_back(tok);
}
}
if (!bos) // output fs of last token
if (POS_TAGGING) {
Expand All @@ -1133,16 +1143,17 @@ class tagger {
(offsets >> MAX_KEY_BITS) & 0x7f);
write_string(_ptr, ",*,*,*\n", 7);

toks.back().get_feature() = std::string(&fs[(offsets >> 34)],
(offsets >> MAX_KEY_BITS) & 0x7f);
toks.back().get_feature() = ltrim(std::string(&fs[(offsets >> 34)],
(offsets >> MAX_KEY_BITS) & 0x7f));

toks.back().get_feature() += ",*,*,*";
} else {
write_string(
_ptr, &fs[(offsets >> 34)],
(offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff);
toks.back().get_feature() = rtrim(std::string(&fs[(offsets >> 34)],
(offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff));
// feature contains leading '\t' and ending '\n'. we remove it.
toks.back().get_feature() = ltrim(rtrim(std::string(&fs[(offsets >> 34)],
(offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff)));
}
}
write_string(_ptr, POS_TAGGING ? "EOS\n" : "\n", POS_TAGGING ? 4 : 1);
Expand Down Expand Up @@ -1333,6 +1344,7 @@ PYBIND11_MODULE(jagger_ext, m) {
.def("n_tags", &jagger::PyToken::n_tags)
.def("tag", &jagger::PyToken::tag)
.def("set_quote_char", &jagger::PyToken::set_quote_char)
.def("__repr__", &jagger::PyToken::str)
;

}
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
dev_mode = False

jagger_compile_args=[
'-DJAGGER_DEFAULT_MODEL="/usr/local/lib/jagger/model/kwdlc"',
'-DNUM_POS_FIELD=4',
]

if sys.platform.startswith('win32'):
Expand All @@ -33,7 +31,8 @@
setup(
name="jagger",
packages=['jagger'],
version="v0.1.17",
# version is now set by setuptools_scm
#version="v0.1.17",
ext_modules=ext_modules,
long_description=open("./README.md", 'r', encoding='utf8').read(),
long_description_content_type='text/markdown',
Expand Down

0 comments on commit 57e54fb

Please sign in to comment.