diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000..9287a1d --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,79 @@ +name: Build Wheels + +on: [workflow_dispatch] + +jobs: + build_wheels: + name: Build wheels for ${{matrix.python.cp}}-${{ matrix.buildplat.sys }} + runs-on: ${{ matrix.buildplat.runs_on }} + strategy: + matrix: + buildplat: + - { runs_on: ubuntu-22.04, sys: manylinux, arch: x86_64, benv: "" } + - { runs_on: macos-14, sys: macosx, arch: arm64, benv: "14.0" } + python: + - { cp: "cp310", rel: "3.10" } + - { cp: "cp311", rel: "3.11" } + - { cp: "cp312", rel: "3.12" } + - { cp: "cp313", rel: "3.13" } + + steps: + - uses: actions/checkout@v4.1.1 + + # Used to host cibuildwheel + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install cibuildwheel + run: python -m pip install cibuildwheel + + - name: Build wheels (Linux glibc) + if: ${{ matrix.buildplat.sys == 'manylinux' }} + run: python -m cibuildwheel --output-dir wheelhouse + env: + CIBW_BUILD: ${{ matrix.python.cp }}-${{ matrix.buildplat.sys }}* + CIBW_ARCHS_LINUX: "x86_64" + CIBW_BEFORE_ALL_LINUX: > + if command -v apt-get; then + apt-get install -y git libicu-dev libxml2-dev libxslt1-dev libbz2-dev zlib1g-dev autoconf automake autoconf-archive libtool autotools-dev gcc g++ make libboost-dev + elif command -v yum; then + yum install -y git libicu-devel libxml2-devel libxslt-devel zlib-devel bzip2-devel libtool autoconf-archive autoconf automake m4 wget cmake + fi && + ./build-deps.sh + CIBW_BEFORE_BUILD: ./build-boost-python.sh + CIBW_MANYLINUX_X86_64_IMAGE: quay.io/pypa/manylinux_2_28_x86_64 + CIBW_SKIP: "*-win* *-manylinux_i686 pp*" + + - name: Build wheels (Linux musl) + if: ${{ matrix.buildplat.sys == 'musllinux' }} + run: python -m cibuildwheel --output-dir wheelhouse + env: + CIBW_BUILD: ${{ matrix.python.cp }}-${{ matrix.buildplat.sys }}* + CIBW_ARCHS_LINUX: "x86_64" + CIBW_BEFORE_ALL_LINUX: > + apk add 
build-base git autoconf-archive autoconf automake libtool bzip2-dev icu-dev libxml2-dev boost-dev boost-python3 libtool rsync && + mkdir -p /usr/local/share/aclocal/ && rsync -av --ignore-existing /usr/share/aclocal/*.m4 /usr/local/share/aclocal/ && + ./build-deps.sh + CIBW_MUSLLINUX_X86_64_IMAGE: quay.io/pypa/musllinux_1_1_x86_64 + CIBW_MUSLLINUX_AARCH64_IMAGE: quay.io/pypa/musllinux_1_1_aarch64 + CIBW_SKIP: "*-win* *-manylinux_i686 pp*" + + - name: Build wheels (macOS) + if: ${{ runner.os == 'macOS' && matrix.python.cp == 'cp313' }} + run: python -m cibuildwheel --output-dir wheelhouse + env: + CIBW_BUILD: ${{ matrix.python.cp }}-${{ matrix.buildplat.sys }}* + CIBW_ARCHS: ${{ matrix.buildplat.arch }} + CIBW_ENVIRONMENT: "MACOSX_DEPLOYMENT_TARGET=${{ matrix.buildplat.benv }}" + CIBW_BEFORE_ALL_MACOS: > + brew install boost boost-python3 && + brew tap fbkarsdorp/homebrew-lamachine && + brew install timbl && + du -ah /opt/homebrew | grep boost_python + + - uses: actions/upload-artifact@v4 + if: ${{ ! (runner.os == 'macOS' && matrix.python.cp != 'cp313') }} + with: + name: ${{matrix.python.cp}}-${{matrix.buildplat.sys}}-${{matrix.buildplat.arch}} + path: ./wheelhouse/*.whl diff --git a/README.rst b/README.rst index fdb23a9..cd43574 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,17 @@ +.. image:: https://www.repostatus.org/badges/latest/active.svg + :alt: Project Status: Active – The project has reached a stable, usable state and is being actively developed. + :target: https://www.repostatus.org/#active + +.. image:: https://zenodo.org/badge/8136669.svg + :target: https://zenodo.org/badge/latestdoi/8136669 + ====================== README: python-timbl ====================== :Authors: Sander Canisius, Maarten van Gompel :Contact: proycon@anaproy.nl -:Web site: http://github.com/proycon/python-timbl/ +:Web site: https://github.com/proycon/python-timbl/ python-timbl is a Python extension module wrapping the full TiMBL C++ programming interface. 
With this module, all functionality exposed @@ -12,7 +19,9 @@ through the C++ interface is also available to Python scripts. Being able to access the API from Python greatly facilitates prototyping TiMBL-based applications. -This is the 2013 release by Maarten van Gompel, building on the 2006 release by Sander Canisius. For those used to the old library, there is one backwards-incompatible change, adapt your scripts to use ``import timblapi`` instead of ``import timbl``, as the latter is now a higher-level interface. +This is the 2013 release by Maarten van Gompel, building on the 2006 release by Sander Canisius. For those used to the old library, there is one backwards-incompatible change, adapt your scripts to use ``import timblapi`` instead of ``import timbl``, as the latter is now a higher-level interface. + +Since 2020, this only supports Python 3, Python 2 support has been deprecated. License ======= @@ -26,66 +35,34 @@ TiMBL. Installation ============ -python-timbl is distributed as part of **LaMachine** -(https://github.com/proycon/lamachine), which significantly simplifies -compilation and installation. The remainder of the instructions in this section -refer to manual compilation and installation. +In a Python virtual environment, run: + +``` +pip install python3-timbl +``` + +Note that on macOS, wheel packages are currently only available for Python +3.13, as this is the Python version Homebrew uses in linking libboost-python. + +If no wheels (binary packages) are available for your system, then this will +attempt to compile from source. If that is the case, a number of dependencies +are required: python-timbl depends on two external packages, which must have been built and/or installed on your system in order to successfully build python-timbl. The first is TiMBL itself; download its tarball from TiMBL's homepage and -follow the installation instructions, recent Ubuntu/Debian users will find -timbl in their distribution's package repository. 
In the remainder of this -section, it is assumed that ``$TIMBL_HEADERS`` points to the directory that -contains ``timbl/TimblAPI.h``, and ``$TIMBL_LIBS`` the directory that has -contains the Timbl libraries. Note that Timbl itself depends on additional -dependencies. - -The second prerequisite is Boost.Python, a library that facilitates writing +follow the installation instructions. The second prerequisite is Boost.Python, a library that facilitates writing Python extension modules in C++. Many Linux distributions come with prebuilt packages of Boost.Python. If so, install this package; on Ubuntu/Debian this -can be done as follows:: +can be done as follows. $ sudo apt-get install libboost-python libboost-python-dev -If not, refer to the `Boost installation instructions`_ to build and install -Boost.Python manually. In the remainder of this section, let ``$BOOST_HEADERS`` -refer to the directory that contains the Boost header files, and -``$BOOST_LIBS`` to the directory that contains the Boost library files. If you -installed Boost.Python with your distribution's package manager, these -directories are probably ``/usr/include`` and ``/usr/lib`` respectively. - -.. 
_Boost installation instructions: http://www.boost.org/more/getting_started.html - - -If both prerequisites have been installed on your system, python-timbl can be -obtained through github:: - - $ git clone git://github.com/proycon/python-timbl.git - $ cd python-timbl - -and can then be built and installed with the following command, use -``setup2.py`` for Python 2 and ``setup3.py`` for Python 3:: - - $ sudo python setup3.py \ - build_ext --boost-include-dir=$BOOST_HEADERS \ - --boost-library-dir=$BOOST_LIBS \ - --timbl-include-dir=$TIMBL_HEADERS \ - --timbl-library-dir=$TIMBL_LIBS \ - install --prefix=/dir/to/install/in - -This is the verbose variant, if default locations are used then the following may suffice already:: - - $ sudo python setup3.py install - - -The ``--prefix`` option to the install command denotes the directory in which the module is to be installed. If you have the appropriate system permissions, you can leave out this option. The module will then be installed in the Python system tree. Otherwise, make sure that the installation directory is in the module search path of your Python -system. Usage ======= -python-timbl offers two interface to the timbl API. A low-level interface contained in the module ``timblapi``, which is very much like the C++ library, and a high-level object oriented interface in the ``timbl`` module, which offers a ``TimblClassifier`` class. +python-timbl offers two interface to the timbl API. A low-level interface contained in the module ``timblapi``, which is very much like the C++ library, and a high-level object oriented interface in the ``timbl`` module, which offers a ``TimblClassifier`` class. 
timbl.TimblClassifier: High-level interface ---------------------------------------------- @@ -104,18 +81,18 @@ Training instances can be added using the ``append(featurevector, classlabel)`` classifier.append( (1,0,0), 'financial') classifier.append( (0,1,0), 'furniture') classifier.append( (0,0,1), 'geographic') - + Subsequently, you invoke the actual training, note that at each step Timbl may output considerable details about what it is doing to standard error output:: classifier.train() - + The results of this training is an instance base file, which you can save to file so you can load it again later:: classifier.save() - - classifier = timbl.TimblClassifier("wsd-bank", "-a 0 -k 1" ) - classifier.load() - + + classifier = timbl.TimblClassifier("wsd-bank", "-a 0 -k 1" ) + classifier.load() + The main advantage of the Python library is the fact that you can classify instances on the fly as follows, just pass a feature vector and optionally also a class label to ``classify(featurevector, classlabel)``:: @@ -129,7 +106,7 @@ You can also create a test file and test it all at once:: classifier.addinstance("testfile", (1,0,0),'financial' ) #addinstance can be used to add instances to external files (use append() for training) classifier.addinstance("testfile", (0,1,0),'furniture' ) classifier.addinstance("testfile", (0,0,1),'geograpic' ) - classifier.addinstance("testfile", (1,1,0),'geograpic' ) #this one will be wrongly classified as financial & furniture + classifier.addinstance("testfile", (1,1,0),'geograpic' ) #this one will be wrongly classified as financial & furniture classifier.test("testfile") print "Accuracy: ", classifier.getAccuracy() @@ -149,7 +126,7 @@ exists for this ``classify`` method. If you do not set this option, everything will still work fine, but you won't benefit from actual concurrency due to Python's the Global Interpret Lock. 
- + timblapi: Low-level interface ------------------------------- @@ -196,4 +173,6 @@ manually call the ``initthreading()`` method. Three TiMBL API methods print information to a standard C++ output stream object (ShowBestNeighbors, ShowOptions, ShowSettings, ShowSettings). In the Python interface, these methods will only work with Python (stream) objects that have a fileno method returning a valid file descriptor. Alternatively, three new methods are provided (bestNeighbo(u)rs, options, settings); these methods return the same information as a Python string object. +**scikit-learn wrapper** +A wrapper for use in scikit-learn has been added. It was designed for use in scikit-learn Pipeline objects. The wrapper is not finished and has to date only been tested on sparse data. Note that TiMBL does not work well with large amounts of features. It is suggested to reduce the amount of features to a number below 100 to keep system performance reasonable. Use on servers with large amounts of memory and processing cores is advised. diff --git a/build-boost-python.sh b/build-boost-python.sh new file mode 100755 index 0000000..6bc6d4d --- /dev/null +++ b/build-boost-python.sh @@ -0,0 +1,18 @@ +#!/bin/sh + + +# build boost-python from source on AlmaLinux 8 in manylinux_2_28 container (do not use in other contexts) + +set -e + +#var gets set by cibuildwheel, assign to PYTHON_HOME for boost +export PYTHON_HOME=$Python_ROOT_DIR + +PREVPWD="$(pwd)" && cd /tmp/ # remember the starting directory so the final cd can return to it +wget -q https://github.com/boostorg/boost/releases/download/boost-1.87.0/boost-1.87.0-cmake.tar.gz +tar -xzf boost-1.87.0-cmake.tar.gz +cd boost-1.87.0 +./bootstrap.sh +./b2 --clean +./b2 install --with-python --prefix=/usr +cd "$PREVPWD" diff --git a/build-deps.sh b/build-deps.sh new file mode 100755 index 0000000..2f2381a --- /dev/null +++ b/build-deps.sh @@ -0,0 +1,80 @@ +#!/bin/sh + +# Builds dependencies (latest stable releases) from source +# Used for building wheels. Invoke via 'make wheels' rather +# than directly! + +set -e + +. 
/etc/os-release +echo "OS: $ID">&2 +echo "VERSION: $VERSION_ID">&2 + +get_latest_version() { + #Finds the latest git tag or falls back to returning the git default branch (usually master or main) + #Assumes some kind of semantic versioning (possibly with a v prefix) + TAG=$(git tag -l | grep -E "^v?[0-9]+(\.[0-9])*" | sort -t. -k 1.2,1n -k 2,2n -k 3,3n -k 4,4n | tail -n 1) + if [ -z "$TAG" ]; then + echo "No releases found, falling back to default git branch!">&2 + #output the git default branch for the repository in the current working dir (usually master or main) + git symbolic-ref refs/remotes/origin/HEAD | sed 's@^refs/remotes/origin/@@' + else + echo "$TAG" + fi +} + +[ -z "$PREFIX" ] && PREFIX="/usr/local/" +if [ "$ID" = "almalinux" ] || [ "$ID" = "centos" ] || [ "$ID" = "rhel" ]; then + if [ -d /usr/local/share/aclocal ]; then + #needed for manylinux_2_28 container which ships custom autoconf, possibly others too? + export ACLOCAL_PATH=/usr/share/aclocal + fi + case $VERSION_ID in + 7*) + if [ -d /opt/rh/devtoolset-10/root/usr/lib ]; then + #we are running in the manylinux2014 image + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/opt/rh/devtoolset-10/root/usr/lib + #libxml2 is out of date, compile and install a new one + yum install -y xz + wget https://download.gnome.org/sources/libxml2/2.9/libxml2-2.9.14.tar.xz + unxz libxml2-2.9.14.tar.xz + tar -xf libxml2-2.9.14.tar + cd libxml2-2.9.14 && ./configure --prefix=$PREFIX --without-python && make && make install + cd .. 
+ fi + ;; + esac +fi + +ORIGDIR="$(pwd)" # the shell rewrites PWD on every cd, so keep the start directory in our own variable +BUILDDIR="$(mktemp -dt "build-deps.XXXXXX")" +cd "$BUILDDIR" +for PACKAGE in LanguageMachines/ticcutils LanguageMachines/timbl; do + echo "Git cloning $PACKAGE ">&2 + git clone https://github.com/$PACKAGE + PACKAGE="$(basename $PACKAGE)" + cd "$PACKAGE" + if [ "$1" != "--devel" ]; then + VERSION="$(get_latest_version)" + if [ "$VERSION" != "master" ] && [ "$VERSION" != "main" ] && [ "$VERSION" != "devel" ]; then + echo "Checking out latest stable version: $VERSION">&2 + git -c advice.detachedHead=false checkout "$VERSION" + fi + fi + echo "Bootstrapping $PACKAGE ">&2 + if [ ! -f configure ] && [ -f configure.ac ]; then + #shellcheck disable=SC2086 + autoreconf --install --verbose + fi + echo "Configuring $PACKAGE" >&2 + ./configure --prefix="$PREFIX" >&2 + echo "Make $PACKAGE" >&2 + make + echo "Make install $PACKAGE" >&2 + make install + cd .. +done +cd "$ORIGDIR" +[ -n "$BUILDDIR" ] && rm -Rf "$BUILDDIR" + +echo "Dependencies installed" >&2 diff --git a/example.py b/example.py index edb9a0b..2bc6a0c 100755 --- a/example.py +++ b/example.py @@ -1,19 +1,16 @@ #! /usr/bin/env python # -*- coding: utf8 -*- - -from __future__ import print_function, unicode_literals, division, absolute_import #Make Python 2.x act as much like Python 3 as possible - import timbl import os -#We are building a very simple context-aware translator Word Sense Disambiguator for the word "bank", based on the occurrence of some keywords in the same sentence: +#We are building a very simple context-aware translator Word Sense Disambiguator for the word "bank", based on the occurrence of some keywords in the same sentence: -# The features are binary and represent presence or absence of certain keywords. We choose: +# The features are binary and represent presence or absence of certain keywords. We choose: # - money # - sit # - river -#They have a value of 0 or 1 (but note that Timbl support string features just as well!) 
+#They have a value of 0 or 1 (but note that Timbl support string features just as well!) #The classes we predict are: # - financial @@ -35,7 +32,7 @@ #We start anew and load the classifier again (of course we could have just skipped this and the save step and continued immediately) classifier = timbl.TimblClassifier("wsd-bank", "-a 0 -k 1" ) #wsd-bank will be the prefix of any files written for timbl -classifier.load() #even if this is omitted it will still work, the first classify() call will invoke load() +classifier.load() #even if this is omitted it will still work, the first classify() call will invoke load() #Let's classify an instance: classlabel, distribution, distance = classifier.classify( (1,0,0) ) @@ -56,16 +53,18 @@ os.unlink("testfile") -classifier = timbl.TimblClassifier("wsd-bank", "-a 0 -k 1" ) + +classifier = timbl.TimblClassifier("wsd-bank", "-a 0 -k 1 +v n+di+k" ) #add some extra verbosity flags classifier.load() classifier.addinstance("testfile", (1,0,0),'financial' ) #addinstance can be used to add instances to external files (use append() for training) classifier.addinstance("testfile", (0,1,0),'furniture' ) classifier.addinstance("testfile", (0,0,1),'geograpic' ) -classifier.addinstance("testfile", (1,1,0),'geograpic' ) #this one will be wrongly classified as financial & furniture +classifier.addinstance("testfile", (1,1,0),'geograpic' ) #this one will be wrongly classified as financial & furniture classifier.test("testfile") print("Accuracy: ", classifier.getAccuracy()) +print("Best neighbours: ", classifier.bestNeighbours()) #this only works with the extra verbosity flags and only if python-timbl is compiled with gcc diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b24295f --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[build_ext] +force=1 diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..687de05 --- /dev/null +++ b/setup.py @@ -0,0 +1,194 @@ +#!/bin/env python3 + +import sys +import os +import 
shutil +import platform +import glob + +from distutils.core import setup, Extension +from distutils.command.build_ext import build_ext +from distutils.dep_util import newer +from distutils.unixccompiler import UnixCCompiler + + +def updateDocHeader(input, output): + docstrings = {} + exec(compile(open(input, "rb").read(), input, 'exec'), docstrings) + + stream = open(output, "w") + print("#ifndef TIMBL_DOC_H",file=stream) + print("#define TIMBL_DOC_H\n",file=stream) + print("#include \n",file=stream) + + for var in filter(lambda v: v.endswith("_DOC"), docstrings): + print("PyDoc_STRVAR(%s, \"%s\");\n" % (var, str(docstrings[var].strip().encode("unicode_escape"), 'ascii') ), file=stream) + + print("#endif", file=stream) + + stream.close() + +includedirs = [] +libdirs = [] +print(f"system={platform.system()} machine={platform.machine()}", file=sys.stderr) +if platform.system() == "Darwin": + #we are running on Mac OS X (with homebrew hopefully), stuff is in specific locations: + if platform.machine().lower() == "arm64": + print("(macos arm64 detected)", file=sys.stderr) + libdirs.append("/opt/homebrew/lib") + includedirs.append("/opt/homebrew/include") + libdirs.append("/opt/homebrew/icu4c/lib") + includedirs.append("/opt/homebrew/icu4c/include") + libdirs.append("/opt/homebrew/libxml2/lib") + includedirs.append("/opt/homebrew/libxml2/include") + includedirs.append("/opt/homebrew/libxml2/include/libxml2") + libdirs.append("/opt/homebrew/opt/icu4c/lib") + includedirs.append("/opt/homebrew/opt/icu4c/include") + libdirs.append("/opt/homebrew/opt/libxml2/lib") + includedirs.append("/opt/homebrew/opt/libxml2/include") + libdirs.append("/opt/homebrew/opt/boost-python3/lib") + libdirs.append("/opt/homebrew/opt/boost/lib") + includedirs.append("/opt/homebrew/opt/boost/include") + else: + #we are running on Mac OS X with homebrew, stuff is in specific locations: + libdirs.append("/usr/local/opt/icu4c/lib") + includedirs.append("/usr/local/opt/icu4c/include") + 
libdirs.append("/usr/local/opt/libxml2/lib") + includedirs.append("/usr/local/opt/libxml2/include") + includedirs.append("/usr/local/opt/libxml2/include/libxml2") + libdirs.append("/usr/local/opt/boost-python3/lib") + includedirs.append("/usr/local/opt/boost-python3/lib") + libdirs.append("/usr/local/opt/boost/lib") + includedirs.append("/usr/local/opt/boost/include") + +#add some common default paths +includedirs += ['/usr/include/', '/usr/include/libxml2','/usr/local/include/' ] +libdirs += ['/usr/lib','/usr/local/lib'] +if 'VIRTUAL_ENV' in os.environ: + includedirs.insert(0,os.environ['VIRTUAL_ENV'] + '/include') + libdirs.insert(0,os.environ['VIRTUAL_ENV'] + '/lib') +if 'INCLUDE_DIRS' in os.environ: + includedirs = list(os.environ['INCLUDE_DIRS'].split(':')) + includedirs +if 'LIBRARY_DIRS' in os.environ: + libdirs = list(os.environ['LIBRARY_DIRS'].split(':')) + libdirs + +if platform.system() == "Darwin": + extra_options = ["--stdlib=libc++",'-D U_USING_ICU_NAMESPACE=1'] +else: + extra_options = ['-D U_USING_ICU_NAMESPACE=1'] + +print(f"include_dirs={' '.join(includedirs)} library_dirs={' '.join(libdirs)} extra_options={' '.join(extra_options)}", file=sys.stderr) + +class BuildExt(build_ext): + def initialize_options(self): + build_ext.initialize_options(self) + pyversion = sys.version.split(" ")[0] + pyversion = pyversion.split(".")[0] + pyversion.split(".")[1] #returns something like 312 for 3.12 + #Find boost + self.findboost(libdirs, includedirs, pyversion) + + def findboost(self, libsearch, includesearch, pyversion): + self.boost_library_dir = None + self.boost_include_dir = None + self.boostlib = "boost_python" + if os.path.exists('/usr/local/opt/boost-python3'): + #Mac OS X with homebrew + self.boostlib = "boost_python3" + libsearch.insert(0,'/usr/local/opt/boost-python3/lib') + libsearch.insert(0,'/usr/local/opt/boost/lib') + includesearch.insert(0,'/usr/local/opt/boost/include') + if os.path.exists('/opt/homebrew/opt/boost-python3'): + self.boostlib = 
"boost_python3" + libsearch.insert(0,'/opt/homebrew/opt/boost-python3/lib') + libsearch.insert(0,'/opt/homebrew/opt/boost/lib') + includesearch.insert(0,'/opt/homebrew/opt/boost/include') + if os.path.exists('/opt/homebrew/opt/boost-python' + pyversion): + self.boostlib = "boost_python" + pyversion + libsearch.insert(0,f"/opt/homebrew/opt/boost-python{pyversion}/lib") + libsearch.insert(0,'/opt/homebrew/opt/boost/lib') + includesearch.insert(0,'/opt/homebrew/opt/boost/include') + + for d in libsearch: + if os.path.exists(d + "/libboost_python-py"+pyversion+".so"): + self.boost_library_dir = d + self.boostlib = "boost_python-py" + pyversion + break + elif os.path.exists(d + "/libboost_python"+pyversion+".so"): + self.boost_library_dir = d + self.boostlib = "boost_python" + pyversion + break + elif os.path.exists(d + "/libboost_python3.so"): + self.boost_library_dir = d + self.boostlib = "boost_python3" + break + elif os.path.exists(d + "/libboost_python.so"): + #probably goes wrong if this is for python 2! + self.boost_library_dir = d + self.boostlib = "boost_python" + break + elif os.path.exists(d + "/libboost_python-py" + pyversion + ".dylib"): #Mac OS X + self.boost_library_dir = d + self.boostlib = "boost_python-py" + pyversion + break + elif os.path.exists(d + "/libboost_python" + pyversion + ".dylib"): #Mac OS X + self.boost_library_dir = d + self.boostlib = "boost_python" + pyversion + break + elif os.path.exists(d + "/libboost_python3.dylib"): #Mac OS X + self.boost_library_dir = d + self.boostlib = "boost_python3" + break + elif os.path.exists(d + "/libboost_python.dylib"): #Mac OS X + self.boost_library_dir = d + #probably goes wrong if this is for python 2! 
+ self.boostlib = "boost_python" + break + for d in includesearch: + if os.path.exists(d + "/boost"): + self.boost_include_dir = d + break + + if self.boost_library_dir is not None: + print("Detected boost library in " + self.boost_library_dir + " (" + self.boostlib +")",file=sys.stderr) + else: + print("Unable to find boost library directory automatically. Is libboost-python3 installed?",file=sys.stderr) + self.boost_library_dir = libsearch[0] + if self.boost_include_dir is not None: + print("Detected boost headers in " + self.boost_include_dir ,file=sys.stderr) + else: + print("Unable to find boost headers automatically. Is libboost-python-dev installed?",file=sys.stderr) + self.boost_include_dir = includesearch[0] + + def build_extensions(self): + if newer("src/docstrings.h.in", "src/docstrings.h"): + updateDocHeader("src/docstrings.h.in", "src/docstrings.h") + + for ext in self.extensions: + ext.include_dirs += includedirs + ext.library_dirs += libdirs + + compile_args = ["-std=c++17"] + if platform.system() == "Darwin": + compile_args.append("-stdlib=libc++") + ext.extra_compile_args.extend(compile_args) + ext.libraries.append(self.boostlib) + + build_ext.build_extensions(self) + + +timblModule = Extension("timblapi", ["src/timblapi.cc"], + libraries=["timbl"], + depends=["src/timblapi.h", "src/docstrings.h"]) + +setup( + name="python3-timbl", + version="2025.05.02", + description="Python 3 language binding for the Tilburg Memory-Based Learner", + author="Sander Canisius, Maarten van Gompel", + author_email="S.V.M.Canisius@uvt.nl, proycon@anaproy.nl", + url="http://github.com/proycon/python-timbl", + classifiers=["Development Status :: 4 - Beta","Topic :: Text Processing :: Linguistic","Topic :: Scientific/Engineering","Programming Language :: Python :: 3","Operating System :: POSIX","Intended Audience :: Developers","Intended Audience :: Science/Research","License :: OSI Approved :: GNU General Public License v3 (GPLv3)"], + license="GPL", + 
py_modules=['timbl'], + ext_modules=[timblModule], + cmdclass={"build_ext": BuildExt}) diff --git a/setup2.py b/setup2.py deleted file mode 100755 index 2065094..0000000 --- a/setup2.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/python -import os -import shutil -if os.path.exists('setup2.py'): - shutil.copyfile("setup2.py","setup.py") - -from itertools import ifilter - -from distutils.core import setup, Extension -from distutils.command.build_ext import build_ext -from distutils.dep_util import newer -from distutils.unixccompiler import UnixCCompiler - - -def updateDocHeader(input, output): - docstrings = {} - execfile(input, docstrings) - - stream = open(output, "w") - print >> stream, "#ifndef TIMBL_DOC_H" - print >> stream, "#define TIMBL_DOC_H\n" - print >> stream, "#include \n" - - for var in ifilter(lambda v: v.endswith("_DOC"), docstrings): - print >> stream, "PyDoc_STRVAR(%s, \"%s\");\n" % ( - var, docstrings[var].strip().encode("string_escape")) - - print >> stream, "#endif" - - stream.close() - - -class BuildExt(build_ext): - - user_options = build_ext.user_options + [ - ("boost-include-dir=", None, "directory for boost header files"), - ("boost-library-dir=", None, "directory for boost library files"), - ("timbl-include-dir=", None, "directory for TiMBL files"), - ("timbl-library-dir=", None, "directory for TiMBL library files"), - ("libxml2-include-dir=", None, "directory for LibXML2 files"), - ("libxml2-library-dir=", None, "directory for LibXML2 library files"), - ("static-boost-python", "s", "statically link boost-python")] - - boolean_options = build_ext.boolean_options + [ - "static-boost-python"] - - def initialize_options(self): - build_ext.initialize_options(self) - self.boost_include_dir = "/usr/include" - self.boost_library_dir = "/usr/lib" - self.libxml2_include_dir = "/usr/include/libxml2" - self.libxml2_library_dir = "/usr/lib" - if 'VIRTUAL_ENV' in os.environ and os.path.exists(os.environ['VIRTUAL_ENV'] + '/include/timbl'): - 
self.timbl_include_dir = os.environ['VIRTUAL_ENV'] + '/include' - self.timbl_library_dir = os.environ['VIRTUAL_ENV'] + '/lib' - elif os.path.exists("/usr/include/timbl"): - self.timbl_include_dir = "/usr/include" - self.timbl_library_dir = "/usr/lib" - elif os.path.exists("/usr/local/include/timbl"): - self.timbl_include_dir = "/usr/local/include" - self.timbl_library_dir = "/usr/local/lib" - else: - raise Exception("Timbl not found, make sure to install Timbl and set --timbl-include-dir and --timbl-library-dir appropriately...") - self.static_boost_python = False - - def finalize_options(self): - build_ext.finalize_options(self) - self.ensure_file_exists("boost_include_dir", "boost/python.hpp") - self.ensure_dirname("boost_library_dir") - self.ensure_file_exists("timbl_include_dir", "timbl/TimblAPI.h") - self.ensure_dirname("timbl_library_dir") - self.ensure_file_exists("libxml2_include_dir", "libxml/tree.h") - self.ensure_dirname("libxml2_library_dir") - - def ensure_file_exists(self, option, filename): - self.ensure_dirname(option) - self._ensure_tested_string( - option, - lambda d: os.path.isfile(os.path.join(d, filename)), - "directory name", - "'%s' was not found in '%%s'" % filename) - - def build_extensions(self): - if newer("src/docstrings.h.in", "src/docstrings.h"): - updateDocHeader("src/docstrings.h.in", "src/docstrings.h") - - for ext in self.extensions: - ext.include_dirs.append(self.boost_include_dir) - ext.include_dirs.append(self.timbl_include_dir) - ext.include_dirs.append(self.libxml2_include_dir) - ext.library_dirs.append(self.timbl_library_dir) - ext.library_dirs.append(self.boost_library_dir) - ext.library_dirs.append(self.libxml2_library_dir) - - pyversion = sys.version[0:3][0] + sys.version[0:3][2] #returns something like 27 - if os.path.exists(self.boost_library_dir + "/libboost_python-py"+pyversion+".so"): - boostlib = "boost_python-py" + pyversion - elif os.path.exists(self.boost_library_dir + "/libboost_python2.so"): - boostlib = 
"boost_python2" - elif os.path.exists(self.boost_library_dir + "/libboost_python.so"): - #probably goes wrong if this is for python 3! - boostlib = "boost_python" - else: - print >>sys.stderr, "Unable to find boost library" - sys.exit(65) - - if isinstance(self.compiler, UnixCCompiler) and self.static_boost_python: - ext.extra_link_args.extend( - "-Wl,-Bstatic -l" + boostlib + " -Wl,-Bdynamic".split()) - else: - ext.libraries.append(boostlib) - if isinstance(self.compiler, UnixCCompiler) and \ - self.static_boost_python: - ext.extra_link_args.extend( - "-Wl,-Bstatic -lboost_python -Wl,-Bdynamic".split()) - else: - ext.libraries.append("boost_python") - - build_ext.build_extensions(self) - - -timblModule = Extension("timblapi", ["src/timblapi.cc"], - libraries=["timbl"], - depends=["src/timblapi.h", "src/docstrings.h"]) - - -setup( - name="python-timbl", - version="2015.09.05", - description="Python language binding for the Tilburg Memory-Based Learner", - author="Sander Canisius, Maarten van Gompel", - author_email="S.V.M.Canisius@uvt.nl, proycon@anaproy.nl", - url="http://github.com/proycon/python-timbl", - license="GPL", - classifiers=["Development Status :: 4 - Beta","Topic :: Text Processing :: Linguistic","Topic :: Scientific/Engineering","Programming Language :: Python :: 2.6","Programming Language :: Python :: 2.7","Operating System :: POSIX","Intended Audience :: Developers","Intended Audience :: Science/Research","License :: OSI Approved :: GNU General Public License v3 (GPLv3)"], - py_modules=['timbl'], - ext_modules=[timblModule], - cmdclass={"build_ext": BuildExt}) diff --git a/setup3.py b/setup3.py deleted file mode 100755 index e9664f8..0000000 --- a/setup3.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/python3 - -import sys -import os -import shutil -if os.path.exists('setup3.py'): - shutil.copyfile("setup3.py","setup.py") - -from distutils.core import setup, Extension -from distutils.command.build_ext import build_ext -from distutils.dep_util import 
newer -from distutils.unixccompiler import UnixCCompiler - - -def updateDocHeader(input, output): - docstrings = {} - exec(compile(open(input, "rb").read(), input, 'exec'), docstrings) - - stream = open(output, "w") - print("#ifndef TIMBL_DOC_H",file=stream) - print("#define TIMBL_DOC_H\n",file=stream) - print("#include \n",file=stream) - - for var in filter(lambda v: v.endswith("_DOC"), docstrings): - print("PyDoc_STRVAR(%s, \"%s\");\n" % (var, str(docstrings[var].strip().encode("unicode_escape"), 'ascii') ), file=stream) - - print("#endif", file=stream) - - stream.close() - - -class BuildExt(build_ext): - - user_options = build_ext.user_options + [ - ("boost-include-dir=", None, "directory for boost header files"), - ("boost-library-dir=", None, "directory for boost library files"), - ("timbl-include-dir=", None, "directory for TiMBL files"), - ("timbl-library-dir=", None, "directory for TiMBL library files"), - ("libxml2-include-dir=", None, "directory for LibXML2 files"), - ("libxml2-library-dir=", None, "directory for LibXML2 library files"), - ("static-boost-python3", "s", "statically link boost-python")] - - boolean_options = build_ext.boolean_options + [ - "static-boost-python3"] - - def initialize_options(self): - build_ext.initialize_options(self) - self.boost_include_dir = "/usr/include" - self.boost_library_dir = "/usr/lib" - self.libxml2_include_dir = "/usr/include/libxml2" - self.libxml2_library_dir = "/usr/lib" - if 'VIRTUAL_ENV' in os.environ and os.path.exists(os.environ['VIRTUAL_ENV'] + '/include/timbl'): - self.timbl_include_dir = os.environ['VIRTUAL_ENV'] + '/include' - self.timbl_library_dir = os.environ['VIRTUAL_ENV'] + '/lib' - elif os.path.exists("/usr/include/timbl"): - self.timbl_include_dir = "/usr/include" - self.timbl_library_dir = "/usr/lib" - elif os.path.exists("/usr/local/include/timbl"): - self.timbl_include_dir = "/usr/local/include" - self.timbl_library_dir = "/usr/local/lib" - else: - raise Exception("Timbl not found, make sure 
to install Timbl and set --timbl-include-dir and --timbl-library-dir appropriately...") - self.static_boost_python = False - - def finalize_options(self): - build_ext.finalize_options(self) - self.ensure_file_exists("boost_include_dir", "boost/python.hpp") - self.ensure_dirname("boost_library_dir") - self.ensure_file_exists("timbl_include_dir", "timbl/TimblAPI.h") - self.ensure_dirname("timbl_library_dir") - self.ensure_file_exists("libxml2_include_dir", "libxml/tree.h") - self.ensure_dirname("libxml2_library_dir") - - def ensure_file_exists(self, option, filename): - self.ensure_dirname(option) - self._ensure_tested_string( - option, - lambda d: os.path.isfile(os.path.join(d, filename)), - "directory name", - "'%s' was not found in '%%s'" % filename) - - def build_extensions(self): - if newer("src/docstrings.h.in", "src/docstrings.h"): - updateDocHeader("src/docstrings.h.in", "src/docstrings.h") - - for ext in self.extensions: - ext.include_dirs.append(self.boost_include_dir) - ext.include_dirs.append(self.timbl_include_dir) - ext.include_dirs.append(self.libxml2_include_dir) - ext.library_dirs.append(self.timbl_library_dir) - ext.library_dirs.append(self.boost_library_dir) - ext.library_dirs.append(self.libxml2_library_dir) - - pyversion = sys.version[0:3][0] + sys.version[0:3][2] #returns something like 32 - if os.path.exists(self.boost_library_dir + "/libboost_python-py"+pyversion+".so"): - boostlib = "boost_python-py" + pyversion - elif os.path.exists(self.boost_library_dir + "/libboost_python3.so"): - boostlib = "boost_python3" - elif os.path.exists(self.boost_library_dir + "/libboost_python.so"): - #probably goes wrong if this is for python 2! - boostlib = "boost_python" - elif os.path.exists(self.boost_library_dir + "/libboost_python3.dylib"): #Mac OS X - boostlib = "boost_python3" - elif os.path.exists(self.boost_library_dir + "/libboost_python.dylib"): #Mac OS X - #probably goes wrong if this is for python 2! 
- boostlib = "boost_python" - else: - print("Unable to find boost library",file=sys.stderr) - sys.exit(65) - - if isinstance(self.compiler, UnixCCompiler) and self.static_boost_python: - ext.extra_link_args.extend( - "-Wl,-Bstatic -l" + boostlib + " -Wl,-Bdynamic".split()) - else: - ext.libraries.append(boostlib) - - build_ext.build_extensions(self) - - -timblModule = Extension("timblapi", ["src/timblapi.cc"], - libraries=["timbl"], - depends=["src/timblapi.h", "src/docstrings.h"]) - - -setup( - name="python3-timbl", - version="2015.09.05", - description="Python 3 language binding for the Tilburg Memory-Based Learner", - author="Sander Canisius, Maarten van Gompel", - author_email="S.V.M.Canisius@uvt.nl, proycon@anaproy.nl", - url="http://github.com/proycon/python-timbl", - classifiers=["Development Status :: 4 - Beta","Topic :: Text Processing :: Linguistic","Topic :: Scientific/Engineering","Programming Language :: Python :: 3","Operating System :: POSIX","Intended Audience :: Developers","Intended Audience :: Science/Research","License :: OSI Approved :: GNU General Public License v3 (GPLv3)"], - license="GPL", - py_modules=['timbl'], - ext_modules=[timblModule], - cmdclass={"build_ext": BuildExt}) diff --git a/src/docstrings.h.in b/src/docstrings.h.in index db78212..87b9ef6 100755 --- a/src/docstrings.h.in +++ b/src/docstrings.h.in @@ -287,8 +287,8 @@ of the nearest neighbour for a given test instance. must match, zero (the default) corresponds to a top level distribution, higher values will result in no distribution being returned if the required depth is not reached, this improves performance. - - + + :return: (boolean signalling success or failure, the predicted class, class distribution, distance of the nearest neighbour) @@ -463,21 +463,6 @@ implementation """ -STARTSERVER_DOC = """ -self.startServer(port, maxConnections) - -Start a TiMBL server. 
- -:Parameters: - `port` : int - the TCP port on which to listen for connections - - `maxConnections` : int - the maximum number of simultaneous connections - -:return: boolean signalling success or failure -:rtype: bool -""" CURRENTWEIGHTING_DOC = """ diff --git a/src/timblapi.cc b/src/timblapi.cc index 4f55c35..b3a0f04 100755 --- a/src/timblapi.cc +++ b/src/timblapi.cc @@ -2,27 +2,27 @@ * Copyright (C) 2006-2015 Sander Canisius, Maarten van Gompel * * This file is part of python-timbl. - * + * * python-timbl is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. - * + * * python-timbl is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with python-timbl; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA - * + * * Linking python-timbl statically or dynamically with other modules * is making a combined work based on python-timbl. Thus, the terms * and conditions of the GNU General Public License cover the whole * combination. - * + * * In addition, as a special exception, the copyright holder of * python-timbl gives you permission to combine python-timbl with free * software programs or libraries that are released under the GNU LGPL @@ -33,7 +33,7 @@ * code concerned, provided that you include the source code of that * other code when and as the GNU GPL requires distribution of source * code. 
- * + * * Note that people who make modified versions of python-timbl are not * obligated to grant this special exception for their modified * versions; it is their choice whether to do so. The GNU General @@ -49,11 +49,13 @@ #include "timbl/Instance.h" #include "docstrings.h" +#include #include #include #include #include +#include #ifndef __clang__ #include @@ -66,10 +68,10 @@ using namespace boost::python; tuple TimblApiWrapper::classify(const std::string& line) -{ +{ std::string cls; bool result = Classify(line, cls); - return make_tuple(result, cls); + return boost::python::make_tuple(result, cls); } @@ -78,7 +80,7 @@ tuple TimblApiWrapper::classify2(const std::string& line) std::string cls; double distance; bool result = Classify(line, cls, distance); - return make_tuple(result, cls, distance); + return boost::python::make_tuple(result, cls, distance); } @@ -86,17 +88,17 @@ tuple TimblApiWrapper::classify3(const std::string& line, bool normalize, const { std::string cls; double distance; - const Timbl::ValueDistribution * distrib; + const Timbl::ClassDistribution * distrib; const Timbl::TargetValue * result = Classify(line, distrib , distance); if (result != NULL) { if ((requireddepth > 0) && (matchDepth() < requireddepth)) { - return make_tuple(true, "", python::dict(), 999999); + return boost::python::make_tuple(true, "", python::dict(), 999999); } else { const std::string cls = result->Name(); - return make_tuple(true, cls, dist2dict(distrib, normalize), distance); + return boost::python::make_tuple(true, cls, dist2dict(distrib, normalize), distance); } } else { - return make_tuple(false,"",python::dict(),999999); + return boost::python::make_tuple(false,"",python::dict(),999999); } } @@ -137,31 +139,32 @@ tuple TimblApiWrapper::classify3safe(const std::string& line, bool normalize,con { runningthreads++; PyThreadState * m_thread_state = PyEval_SaveThread(); //release GIL - + Timbl::TimblExperiment * clonedexp = getexperimentforthread(); - const 
Timbl::ValueDistribution * distrib; + const Timbl::ClassDistribution * distrib; double distance; - const Timbl::TargetValue * result = clonedexp->Classify(line, distrib,distance); + const auto line_unicode = TiCC::toUnicodeString(line); + const Timbl::TargetValue * result = clonedexp->Classify(line_unicode, distrib,distance); if (result != NULL) { if ((requireddepth > 0) && (clonedexp->matchDepth() < requireddepth)) { PyEval_RestoreThread(m_thread_state); m_thread_state = NULL; runningthreads--; - return make_tuple(true, "", python::dict(), 999999); + return boost::python::make_tuple(true, "", python::dict(), 999999); } else { const std::string cls = result->Name(); //const std::string diststring = distrib->DistToString(); PyEval_RestoreThread(m_thread_state); m_thread_state = NULL; runningthreads--; - return make_tuple(true, cls, dist2dict(distrib, normalize), distance); + return boost::python::make_tuple(true, cls, dist2dict(distrib, normalize), distance); } } else { PyEval_RestoreThread(m_thread_state); - m_thread_state = NULL; + m_thread_state = NULL; runningthreads--; - return make_tuple(false,"",python::dict(),999999); + return boost::python::make_tuple(false,"",python::dict(),999999); } } @@ -177,7 +180,7 @@ bool TimblApiWrapper::showBestNeighbours(object& stream) { #ifdef __clang__ std::cerr << "showBestNeighbours is not implemented for clang" << std::endl; - return false; + return false; #else int fd = extract(stream.attr("fileno")()); __gnu_cxx::stdio_filebuf fdbuf(dup(fd), std::ios::out); @@ -199,7 +202,7 @@ bool TimblApiWrapper::showOptions(object& stream) { #ifdef __clang__ std::cerr << "showOptions is not implemented for clang" << std::endl; - return false; + return false; #else int fd = extract(stream.attr("fileno")()); __gnu_cxx::stdio_filebuf fdbuf(dup(fd), std::ios::out); @@ -222,12 +225,12 @@ void TimblApiWrapper::initthreading() { detachedexp = grabAndDisconnectExp(); } - + bool TimblApiWrapper::showSettings(object& stream) { #ifdef __clang__ 
std::cerr << "showSettings is not implemented for clang" << std::endl; - return false; + return false; #else int fd = extract(stream.attr("fileno")()); __gnu_cxx::stdio_filebuf fdbuf(dup(fd), std::ios::out); @@ -237,34 +240,24 @@ bool TimblApiWrapper::showSettings(object& stream) } -python::dict TimblApiWrapper::dist2dict(const Timbl::ValueDistribution * distribution, bool normalize, double minf) const { +python::dict TimblApiWrapper::dist2dict(const Timbl::ClassDistribution * distribution, bool normalize, double minf) const { python::dict result; - size_t freq; - - double maxfreq = 0; - + double freq; + double sum = 0.0; if (normalize) { - Timbl::ValueDistribution::VDlist::const_iterator it = distribution->begin(); - while ( it != distribution->end() ){ - Timbl::Vfield *f = it->second; - if (f->Freq() > maxfreq) maxfreq = f->Freq(); - ++it; + for (Timbl::ClassDistribution::VDlist::const_iterator it = distribution->begin(); it != distribution->end(); it++) { + sum += it->second->Weight(); } } - - Timbl::ValueDistribution::VDlist::const_iterator it = distribution->begin(); - while ( it != distribution->end() ){ - Timbl::Vfield *f = it->second; + for (Timbl::ClassDistribution::VDlist::const_iterator it = distribution->begin(); it != distribution->end(); it++) { if (normalize) { - freq = f->Freq() / maxfreq; - } else { - freq = f->Freq(); + it->second->SetWeight(it->second->Weight() / sum); } + freq = it->second->Weight(); if ( freq >= minf ){ - result[f->Value()->Name()] = freq; + result[it->second->Value()->Name()] = freq; } - ++it; } return result; @@ -315,7 +308,7 @@ BOOST_PYTHON_MODULE(timblapi) .def("saveWeights", &TimblApiWrapper::SaveWeights, SAVEWEIGHTS_DOC) .def("getWeights", &TimblApiWrapper::GetWeights, GETWEIGHTS_DOC) - + .def("getAccuracy", &TimblApiWrapper::GetAccuracy, GETACCURACY_DOC) .def("writeArrays", &TimblApiWrapper::WriteArrays, WRITEARRAYS_DOC) @@ -346,7 +339,6 @@ BOOST_PYTHON_MODULE(timblapi) .def("expName", &TimblApiWrapper::ExpName, 
EXPNAME_DOC) .def("versionInfo", &TimblApiWrapper::VersionInfo, VERSIONINFO_DOC) .staticmethod("versionInfo") - .def("startServer", &TimblApiWrapper::StartServer, STARTSERVER_DOC) .def("currentWeighting", &TimblApiWrapper::CurrentWeighting, CURRENTWEIGHTING_DOC) .def("valid", &TimblApiWrapper::Valid) @@ -382,7 +374,7 @@ BOOST_PYTHON_MODULE(timblapi) .value("X2", Timbl::X2) .value("SV", Timbl::SV) ; - + //def("to_string", to_string); } diff --git a/src/timblapi.h b/src/timblapi.h index cfdf824..6a28b4f 100644 --- a/src/timblapi.h +++ b/src/timblapi.h @@ -2,27 +2,27 @@ * Copyright (C) 2006 Sander Canisius * * This file is part of python-timbl. - * + * * python-timbl is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. - * + * * python-timbl is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with python-timbl; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA - * + * * Linking python-timbl statically or dynamically with other modules * is making a combined work based on python-timbl. Thus, the terms * and conditions of the GNU General Public License cover the whole * combination. - * + * * In addition, as a special exception, the copyright holder of * python-timbl gives you permission to combine python-timbl with free * software programs or libraries that are released under the GNU LGPL @@ -33,7 +33,7 @@ * code concerned, provided that you include the source code of that * other code when and as the GNU GPL requires distribution of source * code. 
- * + * * Note that people who make modified versions of python-timbl are not * obligated to grant this special exception for their modified * versions; it is their choice whether to do so. The GNU General @@ -64,23 +64,28 @@ class TimblApiWrapper : public Timbl::TimblAPI { private: std::vector > experimentpool; Timbl::TimblExperiment * detachedexp; - python::dict dist2dict(const Timbl::ValueDistribution * dist, bool=true,double=0) const; + python::dict dist2dict(const Timbl::ClassDistribution * dist, bool=true,double=0) const; pthread_mutex_t lock; //global lock bool debug; int runningthreads; public: - TimblApiWrapper(const std::string& args, const std::string& name="") : Timbl::TimblAPI(args, name) { detachedexp = NULL; debug = false; runningthreads = 0; lock = PTHREAD_MUTEX_INITIALIZER;} - ~TimblApiWrapper() { + TimblApiWrapper(const std::string& args, const std::string& name="") : Timbl::TimblAPI(args, name) { + detachedexp = NULL; + debug = false; + runningthreads = 0; + pthread_mutex_init(&lock, NULL); + } + ~TimblApiWrapper() { if (debug) std::cerr << "TimblApiWrapper Destructor" << std::endl; if (runningthreads == 0) { - if (detachedexp != NULL) delete detachedexp; + if (detachedexp != NULL) delete detachedexp; for (std::vector >::iterator iter = experimentpool.begin(); iter != experimentpool.end(); iter++) { delete iter->second; } } } - + void initthreading(); void enableDebug() { debug = true; }; diff --git a/timbl.py b/timbl.py index fea88ae..5623d94 100644 --- a/timbl.py +++ b/timbl.py @@ -13,61 +13,44 @@ from __future__ import absolute_import import sys -if sys.version < '3': - from codecs import getwriter - stderr = getwriter('utf-8')(sys.stderr) - stdout = getwriter('utf-8')(sys.stdout) -else: - stderr = sys.stderr - stdout = sys.stdout - +from tempfile import mktemp import timblapi import io import os +stderr = sys.stderr +stdout = sys.stdout + + class LoadException(Exception): pass class ClassifyException(Exception): pass -def b(s): - 
"""Conversion to bytes""" - if sys.version < '3': - if isinstance(s, unicode): #pylint: disable=undefined-variable - return s.encode('utf-8') - else: - return s - #else: - # if isinstance(s, str): - # return s.encode('utf-8') def u(s, encoding = 'utf-8', errors='strict'): - #ensure s is properly unicode.. wrapper for python 2.6/2.7, - if sys.version < '3': - #ensure the object is unicode - if isinstance(s, unicode): #pylint: disable=undefined-variable - return s - else: - return unicode(s, encoding,errors=errors) #pylint: disable=undefined-variable + #will work on byte arrays + if isinstance(s, str): + return s else: - #will work on byte arrays - if isinstance(s, str): - return s - else: - return str(s,encoding,errors=errors) + return str(s,encoding,errors=errors) class TimblClassifier(object): - def __init__(self, fileprefix, timbloptions, format = "Tabbed", dist=True, encoding = 'utf-8', overwrite = True, flushthreshold=10000, threading=False, normalize=True, debug=False): + def __init__(self, fileprefix, timbloptions, format = "Tabbed", dist=True, encoding = 'utf-8', overwrite = True, flushthreshold=10000, threading=False, normalize=True, debug=False, sklearn=False, flushdir=None): if format.lower() == "tabbed": self.format = "Tabbed" self.delimiter = "\t" elif format.lower() == "columns": self.format = "Columns" self.delimiter = " " + elif format.lower() == 'sparse': # for sparse arrays, e.g. 
scipy.sparse.csr + self.format = "Sparse" + self.delimiter = "" else: - raise ValueError("Only Tabbed and Columns are supported input format for the python wrapper, not " + format) + raise ValueError("Only Tabbed, Columns, and Sparse are supported input format for the python wrapper, not " + format) + self.timbloptions = timbloptions self.fileprefix = fileprefix @@ -80,11 +63,17 @@ def __init__(self, fileprefix, timbloptions, format = "Tabbed", dist=True, encod self.instances = [] self.api = None self.debug = debug + self.sklearn = sklearn - if os.path.exists(self.fileprefix + ".train") and overwrite: + if sklearn: + import scipy as sp + self.flushfile = mktemp(prefix=self.fileprefix, dir=flushdir) self.flushed = 0 else: - self.flushed = 1 + if os.path.exists(self.fileprefix + ".train") and overwrite: + self.flushed = 0 + else: + self.flushed = 1 self.threading = threading @@ -94,8 +83,10 @@ def validatefeatures(self,features): for feature in features: if isinstance(feature, int) or isinstance(feature, float): validatedfeatures.append( str(feature) ) - elif self.delimiter in feature: + elif self.delimiter in feature and not self.sklearn: raise ValueError("Feature contains delimiter: " + feature) + elif self.sklearn and isinstance(feature, str): #then is sparse added together + validatedfeatures.append(feature) else: validatedfeatures.append(feature) return validatedfeatures @@ -106,10 +97,10 @@ def append(self, features, classlabel): features = self.validatefeatures(features) - if self.delimiter in classlabel: + if self.delimiter in classlabel and self.delimiter != '': raise ValueError("Class label contains delimiter: " + self.delimiter) - self.instances.append(self.delimiter.join(features) + self.delimiter + classlabel) + self.instances.append(self.delimiter.join(features) + (self.delimiter if not self.delimiter == '' else ' ') + classlabel) if len(self.instances) >= self.flushthreshold: self.flush() @@ -117,10 +108,13 @@ def flush(self): if self.debug: 
print("Flushing...",file=sys.stderr) if len(self.instances) == 0: return False - if self.flushed: - f = io.open(self.fileprefix + ".train",'a', encoding=self.encoding) + if hasattr(self, 'flushfile'): + f = io.open(self.flushfile,'w', encoding=self.encoding) else: - f = io.open(self.fileprefix + ".train",'w', encoding=self.encoding) + if self.flushed: + f = io.open(self.fileprefix + ".train",'a', encoding=self.encoding) + else: + f = io.open(self.fileprefix + ".train",'w', encoding=self.encoding) for instance in self.instances: f.write(instance + "\n") @@ -135,22 +129,29 @@ def __delete__(self): def train(self, save=False): self.flush() - if not os.path.exists(self.fileprefix + ".train"): - raise LoadException("Training file '"+self.fileprefix+".train' not found. Did you forget to add instances with append()?") + + if hasattr(self, 'flushfile'): + if not os.path.exists(self.flushfile): + raise LoadException("Training file '"+self.flushfile+"' not found. Did you forget to add instances with append()?") + else: + filepath = self.flushfile + else: + if not os.path.exists(self.fileprefix + ".train"): + raise LoadException("Training file '"+self.fileprefix+".train' not found. 
Did you forget to add instances with append()?") + else: + filepath = self.fileprefix + '.train' + options = "-F " + self.format + " " + self.timbloptions if self.dist: options += " +v+db +v+di" print("Calling Timbl API for training: " + options, file=stderr) - if sys.version < '3': - self.api = timblapi.TimblAPI(b(options), b"") - else: - self.api = timblapi.TimblAPI(options,"") + self.api = timblapi.TimblAPI(options,"") if self.debug: print("Enabling debug for timblapi",file=stderr) self.api.enableDebug() - trainfile = self.fileprefix + ".train" - self.api.learn(b(trainfile)) + trainfile = filepath + self.api.learn(trainfile) if save: self.save() if self.threading: @@ -159,8 +160,8 @@ def train(self, save=False): def save(self): if not self.api: raise Exception("No API instantiated, did you train the classifier first?") - self.api.writeInstanceBase(b(self.fileprefix + ".ibase")) - self.api.saveWeights(b(self.fileprefix + ".wgt")) + self.api.writeInstanceBase(self.fileprefix + ".ibase") + self.api.saveWeights(self.fileprefix + ".wgt") def classify(self, features, allowtopdistribution=True): @@ -168,12 +169,13 @@ def classify(self, features, allowtopdistribution=True): if not self.api: self.load() - testinstance = self.delimiter.join(features) + self.delimiter + "?" + + testinstance = self.delimiter.join(features) + (self.delimiter if not self.delimiter == '' else ' ') + "?" 
if self.dist: if self.threading: - result, cls, distribution, distance = self.api.classify3safe(b(testinstance), self.normalize, int(not allowtopdistribution)) + result, cls, distribution, distance = self.api.classify3safe(testinstance, self.normalize, int(not allowtopdistribution)) else: - result, cls, distribution, distance = self.api.classify3(b(testinstance), self.normalize, int(not allowtopdistribution)) + result, cls, distribution, distance = self.api.classify3(testinstance, self.normalize, int(not allowtopdistribution)) if result: cls = u(cls) return (cls, distribution, distance) @@ -202,15 +204,12 @@ def load(self): raise LoadException("Instance base '"+self.fileprefix+".ibase' not found, did you train and save the classifier first?") options = "-F " + self.format + " " + self.timbloptions - if sys.version < '3': - self.api = timblapi.TimblAPI(b(options), b"") - else: - self.api = timblapi.TimblAPI(options, "") + self.api = timblapi.TimblAPI(options, "") if self.debug: print("Enabling debug for timblapi",file=stderr) self.api.enableDebug() print("Calling Timbl API : " + options,file=stderr) - self.api.getInstanceBase(b(self.fileprefix + '.ibase')) + self.api.getInstanceBase(self.fileprefix + '.ibase') #if os.path.exists(self.fileprefix + ".wgt"): # self.api.getWeights(self.fileprefix + '.wgt') if self.threading: @@ -234,10 +233,7 @@ def test(self, testfile): """Test on an existing testfile and return the accuracy""" if not self.api: self.load() - if sys.version < '3': - self.api.test(b(testfile), b(self.fileprefix + '.out'),b'') - else: - self.api.test(u(testfile), u(self.fileprefix + '.out'),'') + self.api.test(u(testfile), u(self.fileprefix + '.out'),'') return self.api.getAccuracy() @@ -245,18 +241,12 @@ def crossvalidate(self, foldsfile): """Train & Test using cross validation, testfile is a file that contains the filenames of all the folds!""" options = "-F " + self.format + " " + self.timbloptions + " -t cross_validate" print("Instantiating Timbl API 
: " + options,file=stderr) - if sys.version < '3': - self.api = timblapi.TimblAPI(b(options), b"") - else: - self.api = timblapi.TimblAPI(options, "") + self.api = timblapi.TimblAPI(options, "") if self.debug: print("Enabling debug for timblapi",file=stderr) self.api.enableDebug() print("Calling Timbl Test : " + options,file=stderr) - if sys.version < '3': - self.api.test(b(foldsfile),b'',b'') - else: - self.api.test(u(foldsfile),'','') + self.api.test(u(foldsfile),'','') a = self.api.getAccuracy() del self.api return a @@ -267,20 +257,13 @@ def leaveoneout(self): """Train & Test using leave one out""" traintestfile = self.fileprefix + '.train' options = "-F " + self.format + " " + self.timbloptions + " -t leave_one_out" - if sys.version < '3': - self.api = timblapi.TimblAPI(b(options), b"") - else: - self.api = timblapi.TimblAPI(options, "") + self.api = timblapi.TimblAPI(options, "") if self.debug: print("Enabling debug for timblapi",file=stderr) self.api.enableDebug() print("Calling Timbl API : " + options,file=stderr) - if sys.version < '3': - self.api.learn(b(traintestfile)) - self.api.test(b(traintestfile), b(self.fileprefix + '.out'),b'') - else: - self.api.learn(u(traintestfile)) - self.api.test(u(traintestfile), u(self.fileprefix + '.out'),'') + self.api.learn(u(traintestfile)) + self.api.test(u(traintestfile), u(self.fileprefix + '.out'),'') return self.api.getAccuracy() def readtestoutput(self): @@ -324,6 +307,18 @@ def readtestoutput(self): yield " ".join(segments[:endfvec - 2]).split(self.delimiter), segments[endfvec - 2], segments[endfvec - 1], distribution, distance f.close() + def bestNeighbours(self): + return self.api.bestNeighbours() + + def bestNeighbors(self): + return self.api.bestNeighbours() + + def settings(self): + return self.api.settings() + + def options(self): + return self.api.options() + def _parsedistribution(self, instance, start=0, end =None): dist = {} i = start + 1 @@ -347,8 +342,3 @@ def _parsedistribution(self, instance, 
start=0, end =None): return dist - - - - - diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..af2ee70 --- /dev/null +++ b/utils.py @@ -0,0 +1,122 @@ +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils import check_X_y, check_array +from timbl import TimblClassifier +import scipy as sp +import numpy as np + +class skTiMBL(BaseEstimator, ClassifierMixin): + def __init__(self, prefix='timbl', algorithm=4, dist_metric=None, + k=1, normalize=False, debug=0, flushdir=None): + self.prefix = prefix + self.algorithm = algorithm + self.dist_metric = dist_metric + self.k = k + self.normalize = normalize + self.debug = debug + self.flushdir = flushdir + + + def _make_timbl_options(self, *options): + """ + -a algorithm + -m metric + -w weighting + -k amount of neighbours + -d class voting weights + -L frequency threshold + -T which feature index is label + -N max number of features + -H turn hashing on/off + + This function still has to be made, for now the appropriate arguments + can be passed in fit() + """ + pass + + + def fit(self, X, y): + X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr') + + n_rows = X.shape[0] + self.classes_ = np.unique(y) + + if sp.sparse.issparse(X): + if self.debug: print('Features are sparse, choosing faster learning') + + self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm,self.k, X.shape[1]), + format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir, + flushthreshold=20000, normalize=self.normalize) + + for i in range(n_rows): + sparse = ['({},{})'.format(i+1, c) for i,c in zip(X[i].indices, X[i].data)] + self.classifier.append(sparse,str(y[i])) + + else: + + self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]), + debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000, + normalize=self.normalize) + + if y.dtype != 'O': + y = y.astype(str) + + for i in range(n_rows): + 
self.classifier.append(list(X[i].toarray()[0]), y[i]) + + self.classifier.train() + return self + + + def _timbl_predictions(self, X, part_index, y=None): + choices = {0 : lambda x : x.append(np.int64(label)), + 1 : lambda x : x.append([np.float(distance)]), + } + X = check_array(X, dtype=np.float64, accept_sparse='csr') + + n_samples = X.shape[0] + + pred = [] + func = choices[part_index] + if sp.sparse.issparse(X): + if self.debug: print('Features are sparse, choosing faster predictions') + + for i in range(n_samples): + sparse = ['({},{})'.format(i+1, c) for i,c in zip(X[i].indices, X[i].data)] + label,proba, distance = self.classifier.classify(sparse) + func(pred) + + else: + for i in range(n_samples): + label,proba, distance = self.classifier.classify(list(X[i].toarray()[0])) + func(pred) + + return np.array(pred) + + + + def predict(self, X, y=None): + return self._timbl_predictions(X, part_index=0) + + + def predict_proba(self, X, y=None): + """ + TIMBL is a discrete classifier. It cannot give probability estimations. + To ensure that scikit-learn functions with TIMBL (and especially metrics + such as ROC_AUC), this method is implemented. + + For ROC_AUC, the classifier corresponds to a single point in ROC space, + instead of a probabilistic continuum such as classifiers that can give + a probability estimation (e.g. Linear classifiers). For an explanation, + see Fawcett (2005). + """ + return predict(X) + + + def decision_function(self, X, y=None): + """ + The decision function is interpreted here as being the distance between + the instance that is being classified and the nearest point in k space. + """ + return self._timbl_predictions(X, part_index=1) + +