[ner] Remove unused files and move tests, related to #187461

author:           vincent.michel@logilab.fr
changeset:        1cff8a50b49f
branch:           default
phase:            draft
hidden:           yes
parent revision:  #8d5fa29eead4 [old api] Remove deprecated old API, closes #197016
child revision:   #b323882735ec [utils] Create an utils folder, related to #187461
files modified by this revision
ner/__pkginfo__.py
ner/debian/changelog
ner/debian/compat
ner/debian/control
ner/debian/copyright
ner/debian/rules
ner/python-nerdy.spec
ner/setup.py
ner/stopwords.py
ner/test/test_core.py
ner/test/test_dataio.py
ner/test/test_filter.py
ner/test/test_preprocessor.py
ner/test/test_tokenizer.py
reference_data/stopwords.py
reference_data/us_states.txt
test/test_core.py
test/test_filter.py
test/test_ner_dataio.py
test/test_preprocessor.py
test/test_tokenizer.py
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464284 0
# Thu Dec 19 14:44:44 2013 +0000
# Node ID 1cff8a50b49f3eab55783615a9835b4827c6ced7
# Parent 8d5fa29eead42e2992c3cfd0bd90fae7a95aed5c
[ner] Remove unused files and move tests, related to #187461

diff --git a/ner/__pkginfo__.py b/ner/__pkginfo__.py
@@ -1,39 +0,0 @@
1 -# -*- coding:utf-8 -*-
2 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
3 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
4 -#
5 -# This program is free software: you can redistribute it and/or modify it under
6 -# the terms of the GNU Lesser General Public License as published by the Free
7 -# Software Foundation, either version 2.1 of the License, or (at your option)
8 -# any later version.
9 -#
10 -# This program is distributed in the hope that it will be useful, but WITHOUT
11 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
13 -# details.
14 -#
15 -# You should have received a copy of the GNU Lesser General Public License along
16 -# with this program. If not, see <http://www.gnu.org/licenses/>.
17 -"""Nerdy packaging information."""
18 -__docformat__ = "restructuredtext en"
19 -import sys
20 -
21 -distname = 'nerdy'
22 -modname = 'nerdy'
23 -
24 -numversion = (0, 1, 0)
25 -version = '.'.join([str(num) for num in numversion])
26 -
27 -license = 'LGPL' # 2.1 or later
28 -description = "Python library for data alignment"
29 -web = "https://www.logilab.org/project/nerdy"
30 -author = "Logilab"
31 -author_email = "contact@logilab.fr"
32 -
33 -
34 -from os.path import join
35 -scripts = []
36 -include_dirs = []
37 -
38 -if sys.version_info < (2, 7):
39 -    install_requires = ['unittest2 >= 0.5.1']
diff --git a/ner/debian/changelog b/ner/debian/changelog
@@ -1,6 +0,0 @@
40 -nerdy (0.1.0-1) unstable; urgency=low
41 -
42 -  * Initial release of the Nerdy package for Named Entities Recognition in Python.
43 -
44 - -- Vincent michel <Vincent.Michel@logilab.fr>  Tue, 11 Jun 2013 13:59:22 +0200
45 -
diff --git a/ner/debian/compat b/ner/debian/compat
@@ -1,1 +0,0 @@
46 -7
diff --git a/ner/debian/control b/ner/debian/control
@@ -1,12 +0,0 @@
47 -Source: nerdy
48 -Section: python
49 -Priority: optional
50 -Maintainer: LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
51 -Build-Depends: debhelper (>= 7), python (>=2.5), python-support
52 -Standards-Version: 3.9.3
53 -XS-Python-Version: >= 2.5
54 -
55 -Package: python-nerdy
56 -Architecture: all
57 -Depends: ${python:Depends}
58 -Description: Python library for Named Entities Recognition.
diff --git a/ner/debian/copyright b/ner/debian/copyright
@@ -1,8 +0,0 @@
59 -Upstream Author:
60 -
61 -  LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
62 -
63 -Copyright:
64 -
65 -Copyright (c) 2013 LOGILAB S.A. (Paris, FRANCE).
66 -http://www.logilab.fr -- mailto:contact@logilab.fr
diff --git a/ner/debian/rules b/ner/debian/rules
@@ -1,55 +0,0 @@
67 -#!/usr/bin/make -f
68 -# Sample debian/rules that uses debhelper.
69 -# GNU copyright 1997 to 1999 by Joey Hess.
70 -
71 -# Uncomment this to turn on verbose mode.
72 -#export DH_VERBOSE=1
73 -build: build-arch build-indep
74 -build-arch:
75 -	# Nothing to do
76 -build-indep: build-stamp
77 -build-stamp:
78 -	dh_testdir
79 -	NO_SETUPTOOLS=1 python setup.py -q build
80 -	touch build-stamp
81 -
82 -clean:
83 -	dh_testdir
84 -	dh_testroot
85 -	rm -f build-stamp configure-stamp
86 -	rm -rf build
87 -	find . -name "*.pyc" | xargs rm -f
88 -	dh_clean
89 -
90 -install: build
91 -	dh_testdir
92 -	dh_testroot
93 -	dh_clean -k
94 -	dh_installdirs -i
95 -	NO_SETUPTOOLS=1 python setup.py -q install --no-compile --prefix=debian/python-nerdy/usr/
96 -
97 -
98 -# Build architecture-independent files here.
99 -binary-indep: build install
100 -	dh_testdir
101 -	dh_testroot
102 -	dh_install -i
103 -	dh_installchangelogs -i
104 -	dh_installexamples -i
105 -	dh_installdocs -i
106 -	dh_installman -i
107 -	dh_pysupport -i
108 -	dh_link -i
109 -	dh_compress -i -X.py -X.ini -X.xml -Xtest
110 -	dh_fixperms -i
111 -	dh_installdeb -i
112 -	dh_gencontrol -i
113 -	dh_md5sums -i
114 -	dh_builddeb -i
115 -
116 -
117 -# Build architecture-dependent files here.
118 -binary-arch:
119 -
120 -binary: binary-indep
121 -.PHONY: build clean binary-arch binary-indep binary
diff --git a/ner/python-nerdy.spec b/ner/python-nerdy.spec
@@ -1,48 +0,0 @@
122 -%if 0%{?el5}
123 -%define python python26
124 -%define __python /usr/bin/python2.6
125 -%{!?python_scriptarch: %define python_scriptarch %(%{__python} -c "from distutils.sysconfig import get_python_lib; from os.path import join; print join(get_python_lib(1, 1), 'scripts')")}
126 -%else
127 -%define python python
128 -%define __python /usr/bin/python
129 -%endif
130 -
131 -Name:           %{python}-nerdy
132 -Version:        0.1.0
133 -Release:        logilab.1%{?dist}
134 -Summary:        Python library for data alignment
135 -Group:          Development/Languages/Python
136 -License:        LGPL
137 -Source0:        nerdy-%{version}.tar.gz
138 -
139 -BuildArch:      noarch
140 -BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-buildroot
141 -
142 -BuildRequires:  %{python}
143 -Requires:       %{python}, %{python}-lxml
144 -
145 -
146 -%description
147 -entity / relation schema
148 -
149 -%prep
150 -%setup -q -n nerdy-%{version}
151 -
152 -%build
153 -%{__python} setup.py build
154 -%if 0%{?el5}
155 -# change the python version in shebangs
156 -find . -name '*.py' -type f -print0 |  xargs -0 sed -i '1,3s;^#!.*python.*$;#! /usr/bin/python2.6;'
157 -%endif
158 -
159 -%install
160 -rm -rf $RPM_BUILD_ROOT
161 -NO_SETUPTOOLS=1 %{__python} setup.py install -O1 --skip-build --root $RPM_BUILD_ROOT %{?python_scriptarch: --install-scripts=%{python_scriptarch}}
162 -
163 -%clean
164 -rm -rf $RPM_BUILD_ROOT
165 -
166 -%files 
167 -%defattr(-, root, root)
168 -/*
169 -
diff --git a/ner/setup.py b/ner/setup.py
@@ -1,27 +0,0 @@
170 -# -*- coding:utf-8 -*-
171 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
172 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
173 -#
174 -# This program is free software: you can redistribute it and/or modify it under
175 -# the terms of the GNU Lesser General Public License as published by the Free
176 -# Software Foundation, either version 2.1 of the License, or (at your option)
177 -# any later version.
178 -#
179 -# This program is distributed in the hope that it will be useful, but WITHOUT
180 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
181 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
182 -# details.
183 -#
184 -# You should have received a copy of the GNU Lesser General Public License along
185 -# with this program. If not, see <http://www.gnu.org/licenses/>.
186 -from distutils.core import setup
187 -
188 -setup(name='nerdy',
189 -      version='0.1.0',
190 -      description='Python library for data alignment',
191 -      author='LOGILAB S.A. (Paris, FRANCE)',
192 -      author_email=' <contact@logilab.fr>',
193 -      url='https://www.logilab.org/project/nerdy',
194 -      package_dir={'nerdy': '.'},
195 -      packages=['nerdy'],
196 -     )
diff --git a/ner/stopwords.py b/ner/stopwords.py
@@ -1,15 +0,0 @@
197 -# -*- coding: utf-8 -*-
198 -"""
199 -Stopwords in different languages.
200 -"""
201 -
202 -FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
203 -
204 -
205 -ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 
'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
206 -
207 -
208 -ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 
'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
209 -
210 -
211 -ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 
'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
diff --git a/ner/test/test_core.py b/ner/test/test_core.py
@@ -1,225 +0,0 @@
212 -# -*- coding:utf-8 -*-
213 -#
214 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
215 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
216 -#
217 -# This program is free software: you can redistribute it and/or modify it under
218 -# the terms of the GNU Lesser General Public License as published by the Free
219 -# Software Foundation, either version 2.1 of the License, or (at your option)
220 -# any later version.
221 -#
222 -# This program is distributed in the hope that it will be useful, but WITHOUT
223 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
224 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
225 -# details.
226 -#
227 -# You should have received a copy of the GNU Lesser General Public License along
228 -# with this program. If not, see <http://www.gnu.org/licenses/>.
229 -import unittest2
230 -
231 -from nerdy import core
232 -from nerdy.tokenizer import Token, Sentence
233 -
234 -
235 -class CoreTest(unittest2.TestCase):
236 -    """ Test of core """
237 -
238 -    def test_lexical_source(self):
239 -        """ Test lexical source """
240 -        lexicon = {'everyone': 'http://example.com/everyone',
241 -                   'me': 'http://example.com/me'}
242 -        source = core.NerdySourceLexical(lexicon)
243 -        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
244 -        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
245 -        self.assertEqual(source.query_word('me everyone'), [])
246 -        self.assertEqual(source.query_word('toto'), [])
247 -        # Token
248 -        token = Token('me', 0, 2, None)
249 -        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
250 -        token = Token('ma', 0, 2, None)
251 -        self.assertEqual(source.recognize_token(token), [])
252 -
253 -    def test_rql_source(self):
254 -        """ Test rql source """
255 -        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
256 -                                       'http://www.cubicweb.org')
257 -        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
258 -
259 -    def test_sparql_source(self):
260 -        """ Test sparql source """
261 -        source = core.NerdySourceSparql(u'''SELECT ?uri
262 -                                            WHERE{
263 -                                            ?uri rdfs:label "Python"@en .
264 -                                            ?uri rdf:type ?type}''',
265 -                                        u'http://dbpedia.org/sparql')
266 -        self.assertEqual(source.query_word('cubicweb'),
267 -                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
268 -                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
269 -
270 -    def test_nerdy_process(self):
271 -        """ Test nerdy process """
272 -        text = 'Hello everyone, this is   me speaking. And me.'
273 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
274 -                                          'me': 'http://example.com/me'})
275 -        nerdy = core.NerdyProcess((source,))
276 -        named_entities = nerdy.process_text(text)
277 -        self.assertEqual(named_entities,
278 -                         [('http://example.com/everyone', None,
279 -                           Token(word='everyone', start=6, end=14,
280 -                                           sentence=Sentence(indice=0, start=0, end=38))),
281 -                          ('http://example.com/me', None,
282 -                           Token(word='me', start=26, end=28,
283 -                                           sentence=Sentence(indice=0, start=0, end=38))),
284 -                          ('http://example.com/me', None,
285 -                           Token(word='me', start=43, end=45,
286 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
287 -
288 -    def test_nerdy_process_multisources(self):
289 -        """ Test nerdy process """
290 -        text = 'Hello everyone, this is   me speaking. And me.'
291 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
292 -                                          'me': 'http://example.com/me'})
293 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
294 -        # Two sources, not unique
295 -        nerdy = core.NerdyProcess((source1, source2))
296 -        named_entities = nerdy.process_text(text)
297 -        self.assertEqual(named_entities,
298 -                         [('http://example.com/everyone', None,
299 -                           Token(word='everyone', start=6, end=14,
300 -                                           sentence=Sentence(indice=0, start=0, end=38))),
301 -                          ('http://example.com/me', None,
302 -                           Token(word='me', start=26, end=28,
303 -                                           sentence=Sentence(indice=0, start=0, end=38))),
304 -                          ('http://example2.com/me', None,
305 -                           Token(word='me', start=26, end=28,
306 -                                           sentence=Sentence(indice=0, start=0, end=38))),
307 -                          ('http://example.com/me', None,
308 -                           Token(word='me', start=43, end=45,
309 -                                           sentence=Sentence(indice=1, start=38, end=46))),
310 -                          ('http://example2.com/me', None,
311 -                           Token(word='me', start=43, end=45,
312 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
313 -        # Two sources, unique
314 -        nerdy = core.NerdyProcess((source1, source2), unique=True)
315 -        named_entities = nerdy.process_text(text)
316 -        self.assertEqual(named_entities,
317 -                         [('http://example.com/everyone', None,
318 -                           Token(word='everyone', start=6, end=14,
319 -                                           sentence=Sentence(indice=0, start=0, end=38))),
320 -                          ('http://example.com/me', None,
321 -                           Token(word='me', start=26, end=28,
322 -                                           sentence=Sentence(indice=0, start=0, end=38))),
323 -                          ('http://example.com/me', None,
324 -                           Token(word='me', start=43, end=45,
325 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
326 -        # Two sources inversed, unique
327 -        nerdy = core.NerdyProcess((source2, source1), unique=True)
328 -        named_entities = nerdy.process_text(text)
329 -        self.assertEqual(named_entities,
330 -                         [('http://example.com/everyone', None,
331 -                           Token(word='everyone', start=6, end=14,
332 -                                           sentence=Sentence(indice=0, start=0, end=38))),
333 -                          ('http://example2.com/me', None,
334 -                           Token(word='me', start=26, end=28,
335 -                                           sentence=Sentence(indice=0, start=0, end=38))),
336 -                          ('http://example2.com/me', None,
337 -                           Token(word='me', start=43, end=45,
338 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
339 -
340 -    def test_nerdy_process_add_sources(self):
341 -        """ Test nerdy process """
342 -        text = 'Hello everyone, this is   me speaking. And me.'
343 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
344 -                                          'me': 'http://example.com/me'})
345 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
346 -        nerdy = core.NerdyProcess((source1,))
347 -        named_entities = nerdy.process_text(text)
348 -        self.assertEqual(named_entities,
349 -                         [('http://example.com/everyone', None,
350 -                           Token(word='everyone', start=6, end=14,
351 -                                           sentence=Sentence(indice=0, start=0, end=38))),
352 -                          ('http://example.com/me', None,
353 -                           Token(word='me', start=26, end=28,
354 -                                           sentence=Sentence(indice=0, start=0, end=38))),
355 -                          ('http://example.com/me', None,
356 -                           Token(word='me', start=43, end=45,
357 -                                           sentence=Sentence(indice=1, start=38, end=46))),])
358 -        # Two sources, not unique
359 -        nerdy.add_ner_source(source2)
360 -        named_entities = nerdy.process_text(text)
361 -        self.assertEqual(named_entities,
362 -                         [('http://example.com/everyone', None,
363 -                           Token(word='everyone', start=6, end=14,
364 -                                           sentence=Sentence(indice=0, start=0, end=38))),
365 -                          ('http://example.com/me', None,
366 -                           Token(word='me', start=26, end=28,
367 -                                           sentence=Sentence(indice=0, start=0, end=38))),
368 -                          ('http://example2.com/me', None,
369 -                           Token(word='me', start=26, end=28,
370 -                                           sentence=Sentence(indice=0, start=0, end=38))),
371 -                          ('http://example.com/me', None,
372 -                           Token(word='me', start=43, end=45,
373 -                                           sentence=Sentence(indice=1, start=38, end=46))),
374 -                          ('http://example2.com/me', None,
375 -                           Token(word='me', start=43, end=45,
376 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
377 -
378 -    def test_nerdy_process_preprocess(self):
379 -        """ Test nerdy process """
380 -        text = 'Hello Toto, this is   me speaking. And me.'
381 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
382 -                                          'me': 'http://example.com/me'})
383 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
384 -        nerdy = core.NerdyProcess((source,),
385 -                                  preprocessors=(preprocessor,))
386 -        named_entities = nerdy.process_text(text)
387 -        self.assertEqual(named_entities, [('http://example.com/toto', None,
388 -                                           Token(word='Toto', start=6, end=10,
389 -                                                 sentence=Sentence(indice=0, start=0, end=34)))])
390 -
391 -    def test_nerdy_process_add_preprocess(self):
392 -        """ Test nerdy process """
393 -        text = 'Hello Toto, this is   me speaking. And me.'
394 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
395 -                                          'me': 'http://example.com/me'})
396 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
397 -        nerdy = core.NerdyProcess((source,),)
398 -        named_entities = nerdy.process_text(text)
399 -        self.assertEqual(named_entities,
400 -                         [('http://example.com/toto', None,
401 -                           Token(word='Toto', start=6, end=10,
402 -                                 sentence=Sentence(indice=0, start=0, end=34))),
403 -                          ('http://example.com/me', None,
404 -                           Token(word='me', start=22, end=24,
405 -                                 sentence=Sentence(indice=0, start=0, end=34))),
406 -                          ('http://example.com/me', None,
407 -                           Token(word='me', start=39, end=41,
408 -                                 sentence=Sentence(indice=1, start=34, end=42)))])
409 -        nerdy.add_preprocessors(preprocessor)
410 -        named_entities = nerdy.process_text(text)
411 -        self.assertEqual(named_entities, [('http://example.com/toto', None,
412 -                                           Token(word='Toto', start=6, end=10,
413 -                                                 sentence=Sentence(indice=0, start=0, end=34)))])
414 -
415 -    def test_nerdy_process_chained_word(self):
416 -        """ Test nerdy process """
417 -        text = 'Hello everyone me, this is   me speaking. And me.'
418 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
419 -                                          'everyone me': 'http://example.com/everyone_me',
420 -                                          'me': 'http://example.com/me'})
421 -        nerdy = core.NerdyProcess((source,))
422 -        named_entities = nerdy.process_text(text)
423 -        self.assertEqual(named_entities,
424 -                         [('http://example.com/everyone_me', None,
425 -                           Token(word='everyone me', start=6, end=17,
426 -                                 sentence=Sentence(indice=0, start=0, end=41))),
427 -                          ('http://example.com/me', None,
428 -                           Token(word='me', start=29, end=31,
429 -                                 sentence=Sentence(indice=0, start=0, end=41))),
430 -                          ('http://example.com/me', None,
431 -                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
432 -
433 -
434 -if __name__ == '__main__':
435 -    unittest2.main()
436 -
diff --git a/ner/test/test_dataio.py b/ner/test/test_dataio.py
@@ -1,85 +0,0 @@
437 -# -*- coding:utf-8 -*-
438 -#
439 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
440 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
441 -#
442 -# This program is free software: you can redistribute it and/or modify it under
443 -# the terms of the GNU Lesser General Public License as published by the Free
444 -# Software Foundation, either version 2.1 of the License, or (at your option)
445 -# any later version.
446 -#
447 -# This program is distributed in the hope that it will be useful, but WITHOUT
448 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
449 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
450 -# details.
451 -#
452 -# You should have received a copy of the GNU Lesser General Public License along
453 -# with this program. If not, see <http://www.gnu.org/licenses/>.
454 -import unittest2
455 -
456 -from nerdy import dataio, core
457 -
458 -
459 -class DataioTest(unittest2.TestCase):
460 -    """ Test of dataio """
461 -
462 -    def test_sparql_query(self):
463 -        results = dataio.sparql_query(query=u'''SELECT ?uri
464 -                                                WHERE{
465 -                                                ?uri rdfs:label "Python"@en .
466 -                                                ?uri rdf:type ?type}''',
467 -                                      endpoint=u'http://dbpedia.org/sparql')
468 -        truth = [{u'uri':
469 -                  {u'type': u'uri',
470 -                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
471 -                 {u'uri':
472 -                  {u'type': u'uri',
473 -                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
474 -        self.assertEqual(results, truth)
475 -
476 -    def test_rql_url_query(self):
477 -        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
478 -                                       'http://www.cubicweb.org')
479 -        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
480 -
481 -    def test_prettyprint(self):
482 -        text = 'Hello everyone, this is   me speaking. And me.'
483 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
484 -                                          'me': 'http://example.com/me'})
485 -        nerdy = core.NerdyProcess((source,))
486 -        named_entities = nerdy.process_text(text)
487 -        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
488 -        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
489 -                                u'this is   <a href="http://example.com/me">me</a> speaking. '
490 -                                u'And <a href="http://example.com/me">me</a>.'))
491 -
492 -    def test_prettyprint_class(self):
493 -        text = 'Hello everyone, this is   me speaking. And me.'
494 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
495 -                                          'me': 'http://example.com/me'})
496 -        nerdy = core.NerdyProcess((source,))
497 -        named_entities = nerdy.process_text(text)
498 -        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
499 -        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
500 -                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
501 -                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
502 -
503 -
504 -class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
505 -
506 -    def test_valid(self):
507 -        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
508 -            '<p>coucou</p>'))
509 -
510 -    def test_valid_unicode(self):
511 -        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
512 -            u'<p>hé</p>'))
513 -
514 -    def test_invalid(self):
515 -        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
516 -            '<p><div>coucou</div></p>'))
517 -
518 -
519 -if __name__ == '__main__':
520 -    unittest2.main()
521 -
diff --git a/ner/test/test_filter.py b/ner/test/test_filter.py
@@ -1,99 +0,0 @@
522 -# -*- coding:utf-8 -*-
523 -#
524 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
525 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
526 -#
527 -# This program is free software: you can redistribute it and/or modify it under
528 -# the terms of the GNU Lesser General Public License as published by the Free
529 -# Software Foundation, either version 2.1 of the License, or (at your option)
530 -# any later version.
531 -#
532 -# This program is distributed in the hope that it will be useful, but WITHOUT
533 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
534 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
535 -# details.
536 -#
537 -# You should have received a copy of the GNU Lesser General Public License along
538 -# with this program. If not, see <http://www.gnu.org/licenses/>.
539 -import unittest2
540 -
541 -from nerdy import core
542 -from nerdy.tokenizer import Token, Sentence
543 -
544 -
545 -class FilterTest(unittest2.TestCase):
546 -    """ Test of filters """
547 -
548 -    def test_occurence_filter_min_occ(self):
549 -        """ Test occurence filter """
550 -        text = 'Hello everyone, this is   me speaking. And me.'
551 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
552 -                                          'me': 'http://example.com/me'})
553 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
554 -        _filter = core.NerdyOccurenceFilter(min_occ=2)
555 -        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
556 -        named_entities = nerdy.process_text(text)
557 -        self.assertEqual(named_entities,
558 -                         [('http://example.com/me', None,
559 -                           Token(word='me', start=26, end=28,
560 -                                           sentence=Sentence(indice=0, start=0, end=38))),
561 -                          ('http://example2.com/me', None,
562 -                           Token(word='me', start=26, end=28,
563 -                                           sentence=Sentence(indice=0, start=0, end=38))),
564 -                          ('http://example.com/me', None,
565 -                           Token(word='me', start=43, end=45,
566 -                                           sentence=Sentence(indice=1, start=38, end=46))),
567 -                          ('http://example2.com/me', None,
568 -                           Token(word='me', start=43, end=45,
569 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
570 -
571 -    def test_occurence_filter_max_occ(self):
572 -        """ Test occurence filter """
573 -        text = 'Hello everyone, this is   me speaking. And me.'
574 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
575 -                                          'me': 'http://example.com/me'})
576 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
577 -        _filter = core.NerdyOccurenceFilter(max_occ=1)
578 -        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
579 -        named_entities = nerdy.process_text(text)
580 -        self.assertEqual(named_entities,
581 -                         [('http://example.com/everyone', None,
582 -                           Token(word='everyone', start=6, end=14,
583 -                                           sentence=Sentence(indice=0, start=0, end=38))),])
584 -
585 -    def test_disambiguation_word_length(self):
586 -        """ Test occurence filter """
587 -        text = 'Hello toto tutu. And toto.'
588 -        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
589 -                                          'toto': 'http://example.com/toto'})
590 -        _filter = core.NerdyDisambiguationWordParts()
591 -        nerdy = core.NerdyProcess((source,), filters=(_filter,))
592 -        named_entities = nerdy.process_text(text)
593 -        self.assertEqual(named_entities,
594 -                         [('http://example.com/toto_tutu', None,
595 -                           Token(word='toto tutu', start=6, end=15,
596 -                                 sentence=Sentence(indice=0, start=0, end=16))),
597 -                          ('http://example.com/toto_tutu', None,
598 -                           Token(word='toto', start=21, end=25,
599 -                                 sentence=Sentence(indice=1, start=16, end=26)))])
600 -
601 -    def test_rules_filter(self):
602 -        """ Test rules filter """
603 -        text = 'Hello toto tutu. And toto.'
604 -        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
605 -                                          'toto': 'http://example.com/toto'})
606 -        rules = {'http://example.com/toto': 'http://example.com/tata'}
607 -        _filter = core.NerdyReplacementRulesFilter(rules)
608 -        nerdy = core.NerdyProcess((source,), filters=(_filter,))
609 -        named_entities = nerdy.process_text(text)
610 -        self.assertEqual(named_entities,
611 -                         [('http://example.com/toto_tutu', None,
612 -                           Token(word='toto tutu', start=6, end=15,
613 -                                 sentence=Sentence(indice=0, start=0, end=16))),
614 -                          ('http://example.com/tata', None,
615 -                           Token(word='toto', start=21, end=25,
616 -                                 sentence=Sentence(indice=1, start=16, end=26)))])
617 -
618 -if __name__ == '__main__':
619 -    unittest2.main()
620 -
diff --git a/ner/test/test_preprocessor.py b/ner/test/test_preprocessor.py
@@ -1,97 +0,0 @@
621 -# -*- coding:utf-8 -*-
622 -#
623 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
624 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
625 -#
626 -# This program is free software: you can redistribute it and/or modify it under
627 -# the terms of the GNU Lesser General Public License as published by the Free
628 -# Software Foundation, either version 2.1 of the License, or (at your option)
629 -# any later version.
630 -#
631 -# This program is distributed in the hope that it will be useful, but WITHOUT
632 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
633 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
634 -# details.
635 -#
636 -# You should have received a copy of the GNU Lesser General Public License along
637 -# with this program. If not, see <http://www.gnu.org/licenses/>.
638 -import unittest2
639 -
640 -from nerdy import core, tokenizer
641 -
642 -
643 -class PreprocessorTest(unittest2.TestCase):
644 -    """ Test of preprocessors """
645 -
646 -    def test_lowercasefilter(self):
647 -        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
648 -        token = tokenizer.Token('toto', 0, 4, None)
649 -        self.assertEqual(preprocessor(token), None)
650 -        token = tokenizer.Token('toto Tata', 0, 4, None)
651 -        self.assertEqual(preprocessor(token), token)
652 -        token = tokenizer.Token('toto tata', 0, 4, None)
653 -        self.assertEqual(preprocessor(token), None)
654 -
655 -    def test_wordsizefilter(self):
656 -        preprocessor = core.NerdyWordSizeFilterPreprocessor()
657 -        token = tokenizer.Token('toto', 0, 4, None)
658 -        self.assertEqual(preprocessor(token), token)
659 -        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
660 -        token = tokenizer.Token('toto', 0, 4, None)
661 -        self.assertEqual(preprocessor(token), token)
662 -        token = tokenizer.Token('to', 0, 4, None)
663 -        self.assertEqual(preprocessor(token), None)
664 -        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
665 -        token = tokenizer.Token('toto', 0, 4, None)
666 -        self.assertEqual(preprocessor(token), None)
667 -        token = tokenizer.Token('to', 0, 4, None)
668 -        self.assertEqual(preprocessor(token), token)
669 -
670 -    def test_lowerfirstword(self):
671 -        preprocessor = core.NerdyLowerFirstWordPreprocessor()
672 -        sentence = tokenizer.Sentence(0, 0, 20)
673 -        # Start of the sentence
674 -        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
675 -        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
676 -        self.assertEqual(preprocessor(token1), token2)
677 -        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
678 -        token2 = tokenizer.Token('us tata', 0, 4, sentence)
679 -        self.assertEqual(preprocessor(token1), token2)
680 -        # Not start of the sentence
681 -        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
682 -        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
683 -        self.assertEqual(preprocessor(token1), token2)
684 -        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
685 -        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
686 -        self.assertEqual(preprocessor(token1), token2)
687 -
688 -    def test_stopwordsfilter(self):
689 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
690 -        token = tokenizer.Token('Toto', 0, 4, None)
691 -        self.assertEqual(preprocessor(token), token)
692 -        token = tokenizer.Token('Us', 0, 4, None)
693 -        self.assertEqual(preprocessor(token), None)
694 -        token = tokenizer.Token('Us there', 0, 4, None)
695 -        self.assertEqual(preprocessor(token), token)
696 -        # Split words
697 -        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
698 -        token = tokenizer.Token('Us there', 0, 4, None)
699 -        self.assertEqual(preprocessor(token), None)
700 -        token = tokenizer.Token('Us there toto', 0, 4, None)
701 -        self.assertEqual(preprocessor(token), token)
702 -
703 -    def test_hashtag(self):
704 -        preprocessor = core.NerdyHashTagPreprocessor()
705 -        token = tokenizer.Token('Toto', 0, 4, None)
706 -        self.assertEqual(preprocessor(token), token)
707 -        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
708 -        token2 = tokenizer.Token('BarackObama', 0, 4, None)
709 -        self.assertEqual(preprocessor(token1), token2)
710 -        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
711 -        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
712 -        self.assertEqual(preprocessor(token1), token2)
713 -
714 -
715 -if __name__ == '__main__':
716 -    unittest2.main()
717 -
diff --git a/ner/test/test_tokenizer.py b/ner/test/test_tokenizer.py
@@ -1,88 +0,0 @@
718 -# -*- coding:utf-8 -*-
719 -#
720 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
721 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
722 -#
723 -# This program is free software: you can redistribute it and/or modify it under
724 -# the terms of the GNU Lesser General Public License as published by the Free
725 -# Software Foundation, either version 2.1 of the License, or (at your option)
726 -# any later version.
727 -#
728 -# This program is distributed in the hope that it will be useful, but WITHOUT
729 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
730 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
731 -# details.
732 -#
733 -# You should have received a copy of the GNU Lesser General Public License along
734 -# with this program. If not, see <http://www.gnu.org/licenses/>.
735 -import unittest2
736 -
737 -from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
738 -
739 -
740 -class TokenizerTest(unittest2.TestCase):
741 -    """ Test of tokenizer """
742 -
743 -    def test_richstringtokenizer(self):
744 -        text = 'Hello everyone, this is   me speaking. And me.'
745 -        tokenizer = RichStringTokenizer(text,
746 -                                        token_min_size=1,
747 -                                        token_max_size=3)
748 -        tokens = list(tokenizer)
749 -        self.assertEqual(len(tokens), 18)
750 -        t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
751 -        self.assertEqual(tokens[0], t1)
752 -        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
753 -        self.assertEqual(tokens[16], t2)
754 -
755 -    def test_richstringtokenizer_loadtext(self):
756 -        text = 'Hello everyone, this is   me speaking. And me.'
757 -        tokenizer = RichStringTokenizer(text,
758 -                                        token_min_size=1,
759 -                                        token_max_size=3)
760 -        tokens = list(tokenizer)
761 -        self.assertEqual(len(tokens), 18)
762 -        tokenizer.load_text('Hello everyone')
763 -        tokens = list(tokenizer)
764 -        self.assertEqual(len(tokens), 3)
765 -
766 -    def test_richstringtokenizer_minsize(self):
767 -        text = 'Hello everyone, this is   me speaking. And me.'
768 -        tokenizer = RichStringTokenizer(text,
769 -                                        token_min_size=2,
770 -                                        token_max_size=3)
771 -        tokens = list(tokenizer)
772 -        self.assertEqual(len(tokens), 10)
773 -        t1 =  Token(word='me speaking', start=26, end=37, sentence=Sentence(indice=0, start=0, end=38))
774 -        self.assertEqual(tokens[8], t1)
775 -
776 -    def test_richstringtokenizer_maxsize(self):
777 -        text = 'Hello everyone, this is   me speaking. And me.'
778 -        tokenizer = RichStringTokenizer(text,
779 -                                        token_min_size=1,
780 -                                        token_max_size=4)
781 -        tokens = list(tokenizer)
782 -        self.assertEqual(len(tokens), 21)
783 -        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
784 -        self.assertEqual(tokens[18], t1)
785 -
786 -    def test_richstringtokenizer_sentences(self):
787 -        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
788 -        tokenizer = RichStringTokenizer(text,
789 -                                        token_min_size=1,
790 -                                        token_max_size=4)
791 -        sentences = tokenizer.find_sentences(text)
792 -        self.assertEqual(len(sentences), 4)
793 -        self.assertEqual(text[sentences[0].start:sentences[0].end],
794 -                         'Hello everyone, this is   me speaking.')
795 -        self.assertEqual(text[sentences[1].start:sentences[1].end],
796 -                         ' And me !')
797 -        self.assertEqual(text[sentences[2].start:sentences[2].end],
798 -                         'Why not me ?')
799 -        self.assertEqual(text[sentences[3].start:sentences[3].end],
800 -                         ' Blup')
801 -
802 -
803 -if __name__ == '__main__':
804 -    unittest2.main()
805 -
diff --git a/reference_data/stopwords.py b/reference_data/stopwords.py
@@ -0,0 +1,15 @@
806 +# -*- coding: utf-8 -*-
807 +"""
808 +Stopwords in different languages.
809 +"""
810 +
811 +FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
812 +
813 +
814 +ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 
'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
815 +
816 +
817 +ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 
'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
818 +
819 +
820 +ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 
'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
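
For reference, a minimal sketch of how the relocated stopword sets might be consumed (the import path is an assumption: it presumes reference_data/ is on sys.path; nothing below is part of the changeset):

    from stopwords import ENGLISH_STOPWORDS, FRENCH_STOPWORDS

    def is_stopword(word, lang='en'):
        # Case-insensitive membership test against the requested language set.
        words = ENGLISH_STOPWORDS if lang == 'en' else FRENCH_STOPWORDS
        return word.lower() in words

    assert is_stopword('The')
    assert not is_stopword('Python')
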
diff --git a/reference_data/us_states.txt b/reference_data/us_states.txt
@@ -1,210 +0,0 @@
821 -
822 -# See http://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations
823 -# WARNING: The name of each state should be in French
824 -# (e.g. "Floride", not "Florida")
825 -US_STATES = {'AK': 'Alaska',
826 -             'AL': 'Alabama',
827 -             'AR': 'Arkansas',
828 -             'AZ': 'Arizona',
829 -             'Ala.': 'Alabama',
830 -             'Alas.': 'Alaska',
831 -             'Alaska': 'Alaska',
832 -             'Ariz.': 'Arizona',
833 -             'Ark.': 'Arkansas',
834 -             'Az.': 'Arizona',
835 -             'CA': 'Californie',
836 -             'CF': 'Californie',
837 -             'CL': 'Colorado',
838 -             'CO': 'Colorado',
839 -             'CT': 'Connecticut',
840 -             'Ca.': 'Californie',
841 -             'Cal.': 'Californie',
842 -             'Cali.': 'Californie',
843 -             'Calif.': 'Californie',
844 -             'Col.': 'Colorado',
845 -             'Colo.': 'Colorado',
846 -             'Conn.': 'Connecticut',
847 -             'Ct.': 'Connecticut',
848 -             'D.C.': 'District of ColuFederal district',
849 -             'DC': 'District of ColuFederal district',
850 -             'DE': 'Delaware',
851 -             'DL': 'Delaware',
852 -             'De.': 'Delaware',
853 -             'Del.': 'Delaware',
854 -             'FL': 'Floride',
855 -             'Fl.': 'Floride',
856 -             'Fla.': 'Floride',
857 -             'Flor.': 'Floride',
858 -             'GA': u'Géorgie',
859 -             'Ga.': u'Géorgie',
860 -             'H.I.': 'Hawaii',
861 -             'HA': 'Hawaii',
862 -             'HI': 'Hawaii',
863 -             'Hawaii': 'Hawaii',
864 -             'IA': 'Iowa',
865 -             'ID': 'Idaho',
866 -             'IL': 'Illinois',
867 -             'IN': 'Indiana',
868 -             'Ia.': 'Iowa',
869 -             'Id.': 'Idaho',
870 -             'Ida.': 'Idaho',
871 -             'Idaho': 'Idaho',
872 -             'Il.': 'Illinois',
873 -             "Ill's": 'Illinois',
874 -             'Ill.': 'Illinois',
875 -             'Ills.': 'Illinois',
876 -             'In.': 'Indiana',
877 -             'Ind.': 'Indiana',
878 -             'Ioa.': 'Iowa',
879 -             'Iowa': 'Iowa',
880 -             'KA': 'Kansas',
881 -             'KS': 'Kansas',
882 -             'KY': 'Kentucky',
883 -             'Ka.': 'Kansas',
884 -             'Kan.': 'Kansas',
885 -             'Kans.': 'Kansas',
886 -             'Ks.': 'Kansas',
887 -             'Ky.': 'Kentucky',
888 -             'LA': 'Louisiane',
889 -             'La.': 'Louisiane',
890 -             'MA': 'Massachusetts',
891 -             'MC': 'Michigan',
892 -             'MD': 'Maryland',
893 -             'ME': 'Maine',
894 -             'MI': 'Mississippi',
895 -             'MN': 'Minnesota',
896 -             'MO': 'Missouri',
897 -             'MS': 'Mississippi',
898 -             'MT': 'Montana',
899 -             'Maine': 'Maine',
900 -             'Mass.': 'Massachusetts',
901 -             'Md.': 'Maryland',
902 -             'Me.': 'Maine',
903 -             'Mich.': 'Michigan',
904 -             'Minn.': 'Minnesota',
905 -             'Miss.': 'Mississippi',
906 -             'Mn.': 'Minnesota',
907 -             'Mo.': 'Missouri',
908 -             'Mont.': 'Montana',
909 -             'N. Car.': 'Caroline du Nord',
910 -             'N. Dak.': 'Dakota du Nord',
911 -             'N. Mex.': 'Nouveau-Mexique',
912 -             'N. York': 'New York',
913 -             'N.C.': 'Caroline du Nord',
914 -             'N.D.': 'Dakota du Nord',
915 -             'N.H.': 'New Hampshire',
916 -             'N.J.': 'New Jersey',
917 -             'N.M.': 'Nouveau-Mexique',
918 -             'N.Y.': 'New York',
919 -             'NB': 'Nebraska',
920 -             'NC': 'Caroline du Nord',
921 -             'ND': 'Dakota du Nord',
922 -             'NE': 'Nebraska',
923 -             'NH': 'New Hampshire',
924 -             'NJ': 'New Jersey',
925 -             'NM': 'Nouveau-Mexique',
926 -             'NV': 'Nevada',
927 -             'NY': 'New York',
928 -             'Neb.': 'Nebraska',
929 -             'Nebr.': 'Nebraska',
930 -             'Nev.': 'Nevada',
931 -             'New M.': 'Nouveau-Mexique',
932 -             'NoDak': 'Dakota du Nord',
933 -             'Nv.': 'Nevada',
934 -             'O.': 'Ohio',
935 -             'OH': 'Ohio',
936 -             'OK': 'Oklahoma',
937 -             'OR': 'Oregon',
938 -             'Oh.': 'Ohio',
939 -             'Ohio': 'Ohio',
940 -             'Ok.': 'Oklahoma',
941 -             'Okla.': 'Oklahoma',
942 -             'Or.': 'Oregon',
943 -             'Ore.': 'Oregon',
944 -             'Oreg.': 'Oregon',
945 -             'PA': 'Pennsylvanie',
946 -             'Pa.': 'Pennsylvanie',
947 -             'R.I.': 'Rhode Island',
948 -             'R.I. & P.P.': 'Rhode Island',
949 -             'RI': 'Rhode Island',
950 -             'S. Car.': 'Caroline du Sud',
951 -             'S. Dak.': 'Dakota du Sud',
952 -             'S.C.': 'Caroline du Sud',
953 -             'S.D.': 'Dakota du Sud',
954 -             'SC': 'Caroline du Sud',
955 -             'SD': 'Dakota du Sud',
956 -             'SoDak': 'Dakota du Sud',
957 -             'State': 'Utah',
958 -             'TN': 'Tennessee',
959 -             'TX': 'Texas',
960 -             'Tenn.': 'Tennessee',
961 -             'Tex.': 'Texas',
962 -             'Texas': 'Texas',
963 -             'Tn.': 'Tennessee',
964 -             'Tx.': 'Texas',
965 -             'US-AL': 'Alabama',
966 -             'US-AR': 'Arkansas',
967 -             'US-AZ': 'Arizona',
968 -             'US-CA': 'Californie',
969 -             'US-CO': 'Colorado',
970 -             'US-CT': 'Connecticut',
971 -             'US-DC': 'District of ColuFederal district',
972 -             'US-DE': 'Delaware',
973 -             'US-FL': 'Floride',
974 -             'US-GA': u'Géorgie',
975 -             'US-IL': 'Illinois',
976 -             'US-IN': 'Indiana',
977 -             'US-KY': 'Kentucky',
978 -             'US-LA': 'Louisiane',
979 -             'US-MA': 'Massachusetts',
980 -             'US-MD': 'Maryland',
981 -             'US-MI': 'Michigan',
982 -             'US-MN': 'Minnesota',
983 -             'US-MO': 'Missouri',
984 -             'US-MS': 'Mississippi',
985 -             'US-MT': 'Montana',
986 -             'US-NC': 'Caroline du Nord',
987 -             'US-ND': 'Dakota du Nord',
988 -             'US-NE': 'Nebraska',
989 -             'US-NH': 'New Hampshire',
990 -             'US-NJ': 'New Jersey',
991 -             'US-NM': 'Nouveau-Mexique',
992 -             'US-NY': 'New York',
993 -             'US-OK': 'Oklahoma',
994 -             'US-PA': 'Pennsylvanie',
995 -             'US-RI': 'Rhode Island',
996 -             'US-SC': 'Caroline du Sud',
997 -             'US-SD': 'Dakota du Sud',
998 -             'US-TN': 'Tennessee',
999 -             'US-VA': 'Virginia',
1000 -             'US-VT': 'Vermont',
1001 -             'US-WA': 'Washington',
1002 -             'US-WI': 'Wisconsin',
1003 -             'US-WV': 'Virginie occidentale',
1004 -             'US-WY': 'Wyoming',
1005 -             'UT': 'Utah',
1006 -             'Ut.': 'Utah',
1007 -             'Utah': 'Utah',
1008 -             'VA': 'Virginia',
1009 -             'VT': 'Vermont',
1010 -             'Va.': 'Virginia',
1011 -             'Vt.': 'Vermont',
1012 -             'W. Va.': 'Virginie occidentale',
1013 -             'W. Virg.': 'Virginie occidentale',
1014 -             'W.V.': 'Virginie occidentale',
1015 -             'W.Va.': 'Virginie occidentale',
1016 -             'WA': 'Washington',
1017 -             'WI': 'Wisconsin',
1018 -             'WN': 'Washington',
1019 -             'WS': 'Wisconsin',
1020 -             'WV': 'Virginie occidentale',
1021 -             'WY': 'Wyoming',
1022 -             'Wa.': 'Washington',
1023 -             'Wash.': 'Washington',
1024 -             'Wash. D.C.': 'District of ColuFederal district',
1025 -             'Wi.': 'Wisconsin',
1026 -             'Wis.': 'Wisconsin',
1027 -             'Wisc.': 'Wisconsin',
1028 -             'Wn.': 'Washington',
1029 -             'Wy.': 'Wyoming',
1030 -             'Wyo.': 'Wyoming'}
diff --git a/test/test_core.py b/test/test_core.py
@@ -0,0 +1,225 @@
1031 +# -*- coding:utf-8 -*-
1032 +#
1033 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1034 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1035 +#
1036 +# This program is free software: you can redistribute it and/or modify it under
1037 +# the terms of the GNU Lesser General Public License as published by the Free
1038 +# Software Foundation, either version 2.1 of the License, or (at your option)
1039 +# any later version.
1040 +#
1041 +# This program is distributed in the hope that it will be useful, but WITHOUT
1042 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1043 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1044 +# details.
1045 +#
1046 +# You should have received a copy of the GNU Lesser General Public License along
1047 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1048 +import unittest2
1049 +
1050 +from nerdy import core
1051 +from nerdy.tokenizer import Token, Sentence
1052 +
1053 +
1054 +class CoreTest(unittest2.TestCase):
1055 +    """ Test of core """
1056 +
1057 +    def test_lexical_source(self):
1058 +        """ Test lexical source """
1059 +        lexicon = {'everyone': 'http://example.com/everyone',
1060 +                   'me': 'http://example.com/me'}
1061 +        source = core.NerdySourceLexical(lexicon)
1062 +        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
1063 +        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
1064 +        self.assertEqual(source.query_word('me everyone'), [])
1065 +        self.assertEqual(source.query_word('toto'), [])
1066 +        # Token
1067 +        token = Token('me', 0, 2, None)
1068 +        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
1069 +        token = Token('ma', 0, 2, None)
1070 +        self.assertEqual(source.recognize_token(token), [])
1071 +
1072 +    def test_rql_source(self):
1073 +        """ Test rql source """
1074 +        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
1075 +                                       'http://www.cubicweb.org')
1076 +        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
1077 +
1078 +    def test_sparql_source(self):
1079 +        """ Test sparql source """
1080 +        source = core.NerdySourceSparql(u'''SELECT ?uri
1081 +                                            WHERE{
1082 +                                            ?uri rdfs:label "Python"@en .
1083 +                                            ?uri rdf:type ?type}''',
1084 +                                        u'http://dbpedia.org/sparql')
1085 +        self.assertEqual(source.query_word('cubicweb'),
1086 +                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
1087 +                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
1088 +
1089 +    def test_nerdy_process(self):
1090 +        """ Test nerdy process """
1091 +        text = 'Hello everyone, this is   me speaking. And me.'
1092 +        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1093 +                                          'me': 'http://example.com/me'})
1094 +        nerdy = core.NerdyProcess((source,))
1095 +        named_entities = nerdy.process_text(text)
1096 +        self.assertEqual(named_entities,
1097 +                         [('http://example.com/everyone', None,
1098 +                           Token(word='everyone', start=6, end=14,
1099 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1100 +                          ('http://example.com/me', None,
1101 +                           Token(word='me', start=26, end=28,
1102 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1103 +                          ('http://example.com/me', None,
1104 +                           Token(word='me', start=43, end=45,
1105 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1106 +
1107 +    def test_nerdy_process_multisources(self):
1108 +        """ Test nerdy process with several sources """
1109 +        text = 'Hello everyone, this is   me speaking. And me.'
1110 +        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1111 +                                          'me': 'http://example.com/me'})
1112 +        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
1113 +        # Two sources, not unique
1114 +        nerdy = core.NerdyProcess((source1, source2))
1115 +        named_entities = nerdy.process_text(text)
1116 +        self.assertEqual(named_entities,
1117 +                         [('http://example.com/everyone', None,
1118 +                           Token(word='everyone', start=6, end=14,
1119 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1120 +                          ('http://example.com/me', None,
1121 +                           Token(word='me', start=26, end=28,
1122 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1123 +                          ('http://example2.com/me', None,
1124 +                           Token(word='me', start=26, end=28,
1125 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1126 +                          ('http://example.com/me', None,
1127 +                           Token(word='me', start=43, end=45,
1128 +                                           sentence=Sentence(indice=1, start=38, end=46))),
1129 +                          ('http://example2.com/me', None,
1130 +                           Token(word='me', start=43, end=45,
1131 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1132 +        # Two sources, unique
1133 +        nerdy = core.NerdyProcess((source1, source2), unique=True)
1134 +        named_entities = nerdy.process_text(text)
1135 +        self.assertEqual(named_entities,
1136 +                         [('http://example.com/everyone', None,
1137 +                           Token(word='everyone', start=6, end=14,
1138 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1139 +                          ('http://example.com/me', None,
1140 +                           Token(word='me', start=26, end=28,
1141 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1142 +                          ('http://example.com/me', None,
1143 +                           Token(word='me', start=43, end=45,
1144 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1145 +        # Two sources inversed, unique
1146 +        nerdy = core.NerdyProcess((source2, source1), unique=True)
1147 +        named_entities = nerdy.process_text(text)
1148 +        self.assertEqual(named_entities,
1149 +                         [('http://example.com/everyone', None,
1150 +                           Token(word='everyone', start=6, end=14,
1151 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1152 +                          ('http://example2.com/me', None,
1153 +                           Token(word='me', start=26, end=28,
1154 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1155 +                          ('http://example2.com/me', None,
1156 +                           Token(word='me', start=43, end=45,
1157 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1158 +
1159 +    def test_nerdy_process_add_sources(self):
1160 +        """ Test nerdy process with a source added after creation """
1161 +        text = 'Hello everyone, this is   me speaking. And me.'
1162 +        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1163 +                                          'me': 'http://example.com/me'})
1164 +        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
1165 +        nerdy = core.NerdyProcess((source1,))
1166 +        named_entities = nerdy.process_text(text)
1167 +        self.assertEqual(named_entities,
1168 +                         [('http://example.com/everyone', None,
1169 +                           Token(word='everyone', start=6, end=14,
1170 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1171 +                          ('http://example.com/me', None,
1172 +                           Token(word='me', start=26, end=28,
1173 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1174 +                          ('http://example.com/me', None,
1175 +                           Token(word='me', start=43, end=45,
1176 +                                           sentence=Sentence(indice=1, start=38, end=46))),])
1177 +        # Two sources, not unique
1178 +        nerdy.add_ner_source(source2)
1179 +        named_entities = nerdy.process_text(text)
1180 +        self.assertEqual(named_entities,
1181 +                         [('http://example.com/everyone', None,
1182 +                           Token(word='everyone', start=6, end=14,
1183 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1184 +                          ('http://example.com/me', None,
1185 +                           Token(word='me', start=26, end=28,
1186 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1187 +                          ('http://example2.com/me', None,
1188 +                           Token(word='me', start=26, end=28,
1189 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1190 +                          ('http://example.com/me', None,
1191 +                           Token(word='me', start=43, end=45,
1192 +                                           sentence=Sentence(indice=1, start=38, end=46))),
1193 +                          ('http://example2.com/me', None,
1194 +                           Token(word='me', start=43, end=45,
1195 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1196 +
1197 +    def test_nerdy_process_preprocess(self):
1198 +        """ Test nerdy process with a preprocessor """
1199 +        text = 'Hello Toto, this is   me speaking. And me.'
1200 +        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
1201 +                                          'me': 'http://example.com/me'})
1202 +        preprocessor = core.NerdyStopwordsFilterPreprocessor()
1203 +        nerdy = core.NerdyProcess((source,),
1204 +                                  preprocessors=(preprocessor,))
1205 +        named_entities = nerdy.process_text(text)
1206 +        self.assertEqual(named_entities, [('http://example.com/toto', None,
1207 +                                           Token(word='Toto', start=6, end=10,
1208 +                                                 sentence=Sentence(indice=0, start=0, end=34)))])
1209 +
1210 +    def test_nerdy_process_add_preprocess(self):
1211 +        """ Test nerdy process with a preprocessor added after creation """
1212 +        text = 'Hello Toto, this is   me speaking. And me.'
1213 +        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
1214 +                                          'me': 'http://example.com/me'})
1215 +        preprocessor = core.NerdyStopwordsFilterPreprocessor()
1216 +        nerdy = core.NerdyProcess((source,),)
1217 +        named_entities = nerdy.process_text(text)
1218 +        self.assertEqual(named_entities,
1219 +                         [('http://example.com/toto', None,
1220 +                           Token(word='Toto', start=6, end=10,
1221 +                                 sentence=Sentence(indice=0, start=0, end=34))),
1222 +                          ('http://example.com/me', None,
1223 +                           Token(word='me', start=22, end=24,
1224 +                                 sentence=Sentence(indice=0, start=0, end=34))),
1225 +                          ('http://example.com/me', None,
1226 +                           Token(word='me', start=39, end=41,
1227 +                                 sentence=Sentence(indice=1, start=34, end=42)))])
1228 +        nerdy.add_preprocessors(preprocessor)
1229 +        named_entities = nerdy.process_text(text)
1230 +        self.assertEqual(named_entities, [('http://example.com/toto', None,
1231 +                                           Token(word='Toto', start=6, end=10,
1232 +                                                 sentence=Sentence(indice=0, start=0, end=34)))])
1233 +
1234 +    def test_nerdy_process_chained_word(self):
1235 +        """ Test nerdy process with chained words """
1236 +        text = 'Hello everyone me, this is   me speaking. And me.'
1237 +        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1238 +                                          'everyone me': 'http://example.com/everyone_me',
1239 +                                          'me': 'http://example.com/me'})
1240 +        nerdy = core.NerdyProcess((source,))
1241 +        named_entities = nerdy.process_text(text)
1242 +        self.assertEqual(named_entities,
1243 +                         [('http://example.com/everyone_me', None,
1244 +                           Token(word='everyone me', start=6, end=17,
1245 +                                 sentence=Sentence(indice=0, start=0, end=41))),
1246 +                          ('http://example.com/me', None,
1247 +                           Token(word='me', start=29, end=31,
1248 +                                 sentence=Sentence(indice=0, start=0, end=41))),
1249 +                          ('http://example.com/me', None,
1250 +                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
1251 +
1252 +
1253 +if __name__ == '__main__':
1254 +    unittest2.main()
1255 +
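
The new test/test_core.py exercises the recognition pipeline end to end; a condensed sketch of the same flow outside the test harness (lexicon and text are illustrative):

    from nerdy import core

    source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
                                      'me': 'http://example.com/me'})
    nerdy = core.NerdyProcess((source,))
    for uri, label, token in nerdy.process_text('Hello everyone, this is me.'):
        # Each named entity is a (uri, label, Token) triple; the Token records
        # the matched word and its character offsets in the source text.
        print('%s -> %s [%s:%s]' % (uri, token.word, token.start, token.end))
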
diff --git a/test/test_filter.py b/test/test_filter.py
@@ -0,0 +1,99 @@
1256 +# -*- coding:utf-8 -*-
1257 +#
1258 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1259 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1260 +#
1261 +# This program is free software: you can redistribute it and/or modify it under
1262 +# the terms of the GNU Lesser General Public License as published by the Free
1263 +# Software Foundation, either version 2.1 of the License, or (at your option)
1264 +# any later version.
1265 +#
1266 +# This program is distributed in the hope that it will be useful, but WITHOUT
1267 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1268 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1269 +# details.
1270 +#
1271 +# You should have received a copy of the GNU Lesser General Public License along
1272 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1273 +import unittest2
1274 +
1275 +from nerdy import core
1276 +from nerdy.tokenizer import Token, Sentence
1277 +
1278 +
1279 +class FilterTest(unittest2.TestCase):
1280 +    """ Test of filters """
1281 +
1282 +    def test_occurence_filter_min_occ(self):
1283 +        """ Test occurrence filter """
1284 +        text = 'Hello everyone, this is   me speaking. And me.'
1285 +        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1286 +                                          'me': 'http://example.com/me'})
1287 +        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
1288 +        _filter = core.NerdyOccurenceFilter(min_occ=2)
1289 +        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
1290 +        named_entities = nerdy.process_text(text)
1291 +        self.assertEqual(named_entities,
1292 +                         [('http://example.com/me', None,
1293 +                           Token(word='me', start=26, end=28,
1294 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1295 +                          ('http://example2.com/me', None,
1296 +                           Token(word='me', start=26, end=28,
1297 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1298 +                          ('http://example.com/me', None,
1299 +                           Token(word='me', start=43, end=45,
1300 +                                           sentence=Sentence(indice=1, start=38, end=46))),
1301 +                          ('http://example2.com/me', None,
1302 +                           Token(word='me', start=43, end=45,
1303 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1304 +
1305 +    def test_occurence_filter_max_occ(self):
1306 +        """ Test occurrence filter """
1307 +        text = 'Hello everyone, this is   me speaking. And me.'
1308 +        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1309 +                                          'me': 'http://example.com/me'})
1310 +        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
1311 +        _filter = core.NerdyOccurenceFilter(max_occ=1)
1312 +        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
1313 +        named_entities = nerdy.process_text(text)
1314 +        self.assertEqual(named_entities,
1315 +                         [('http://example.com/everyone', None,
1316 +                           Token(word='everyone', start=6, end=14,
1317 +                                           sentence=Sentence(indice=0, start=0, end=38))),])
1318 +
1319 +    def test_disambiguation_word_length(self):
1320 +        """ Test disambiguation word parts filter """
1321 +        text = 'Hello toto tutu. And toto.'
1322 +        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
1323 +                                          'toto': 'http://example.com/toto'})
1324 +        _filter = core.NerdyDisambiguationWordParts()
1325 +        nerdy = core.NerdyProcess((source,), filters=(_filter,))
1326 +        named_entities = nerdy.process_text(text)
1327 +        self.assertEqual(named_entities,
1328 +                         [('http://example.com/toto_tutu', None,
1329 +                           Token(word='toto tutu', start=6, end=15,
1330 +                                 sentence=Sentence(indice=0, start=0, end=16))),
1331 +                          ('http://example.com/toto_tutu', None,
1332 +                           Token(word='toto', start=21, end=25,
1333 +                                 sentence=Sentence(indice=1, start=16, end=26)))])
1334 +
1335 +    def test_rules_filter(self):
1336 +        """ Test rules filter """
1337 +        text = 'Hello toto tutu. And toto.'
1338 +        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
1339 +                                          'toto': 'http://example.com/toto'})
1340 +        rules = {'http://example.com/toto': 'http://example.com/tata'}
1341 +        _filter = core.NerdyReplacementRulesFilter(rules)
1342 +        nerdy = core.NerdyProcess((source,), filters=(_filter,))
1343 +        named_entities = nerdy.process_text(text)
1344 +        self.assertEqual(named_entities,
1345 +                         [('http://example.com/toto_tutu', None,
1346 +                           Token(word='toto tutu', start=6, end=15,
1347 +                                 sentence=Sentence(indice=0, start=0, end=16))),
1348 +                          ('http://example.com/tata', None,
1349 +                           Token(word='toto', start=21, end=25,
1350 +                                 sentence=Sentence(indice=1, start=16, end=26)))])
1351 +
1352 +if __name__ == '__main__':
1353 +    unittest2.main()
1354 +
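
test/test_filter.py covers the post-processing filters; a short sketch combining a replacement-rules filter with a lexical source, mirroring the assertions above (URIs are illustrative):

    from nerdy import core

    source = core.NerdySourceLexical({'toto': 'http://example.com/toto'})
    rules = {'http://example.com/toto': 'http://example.com/tata'}
    nerdy = core.NerdyProcess((source,),
                              filters=(core.NerdyReplacementRulesFilter(rules),))
    # URIs matching a rule key are rewritten to the rule value after recognition.
    named_entities = nerdy.process_text('Hello toto.')
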
diff --git a/test/test_ner_dataio.py b/test/test_ner_dataio.py
@@ -0,0 +1,85 @@
1355 +# -*- coding:utf-8 -*-
1356 +#
1357 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1358 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1359 +#
1360 +# This program is free software: you can redistribute it and/or modify it under
1361 +# the terms of the GNU Lesser General Public License as published by the Free
1362 +# Software Foundation, either version 2.1 of the License, or (at your option)
1363 +# any later version.
1364 +#
1365 +# This program is distributed in the hope that it will be useful, but WITHOUT
1366 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1367 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1368 +# details.
1369 +#
1370 +# You should have received a copy of the GNU Lesser General Public License along
1371 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1372 +import unittest2
1373 +
1374 +from nerdy import dataio, core
1375 +
1376 +
1377 +class DataioTest(unittest2.TestCase):
1378 +    """ Test of dataio """
1379 +
1380 +    def test_sparql_query(self):
1381 +        results = dataio.sparql_query(query=u'''SELECT ?uri
1382 +                                                WHERE{
1383 +                                                ?uri rdfs:label "Python"@en .
1384 +                                                ?uri rdf:type ?type}''',
1385 +                                      endpoint=u'http://dbpedia.org/sparql')
1386 +        truth = [{u'uri':
1387 +                  {u'type': u'uri',
1388 +                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
1389 +                 {u'uri':
1390 +                  {u'type': u'uri',
1391 +                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
1392 +        self.assertEqual(results, truth)
1393 +
1394 +    def test_rql_url_query(self):
1395 +        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
1396 +                                       'http://www.cubicweb.org')
1397 +        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
1398 +
1399 +    def test_prettyprint(self):
1400 +        text = 'Hello everyone, this is   me speaking. And me.'
1401 +        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1402 +                                          'me': 'http://example.com/me'})
1403 +        nerdy = core.NerdyProcess((source,))
1404 +        named_entities = nerdy.process_text(text)
1405 +        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
1406 +        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
1407 +                                u'this is   <a href="http://example.com/me">me</a> speaking. '
1408 +                                u'And <a href="http://example.com/me">me</a>.'))
1409 +
1410 +    def test_prettyprint_class(self):
1411 +        text = 'Hello everyone, this is   me speaking. And me.'
1412 +        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1413 +                                          'me': 'http://example.com/me'})
1414 +        nerdy = core.NerdyProcess((source,))
1415 +        named_entities = nerdy.process_text(text)
1416 +        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
1417 +        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
1418 +                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
1419 +                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
1420 +
1421 +
1422 +class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
1423 +
1424 +    def test_valid(self):
1425 +        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
1426 +            '<p>coucou</p>'))
1427 +
1428 +    def test_valid_unicode(self):
1429 +        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
1430 +            u'<p>hé</p>'))
1431 +
1432 +    def test_invalid(self):
1433 +        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
1434 +            '<p><div>coucou</div></p>'))
1435 +
1436 +
1437 +if __name__ == '__main__':
1438 +    unittest2.main()
1439 +
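
test/test_ner_dataio.py checks the query helpers and the HTML pretty-printer; a minimal sketch of the pretty-printing path, with output shaped as asserted above (lexicon and text are illustrative):

    from nerdy import core, dataio

    text = 'Hello everyone, this is me.'
    source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone'})
    named_entities = core.NerdyProcess((source,)).process_text(text)
    # Each recognized span is wrapped in an <a> tag pointing at its URI.
    html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities,
                                                     html_class='ner')
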
diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py
@@ -0,0 +1,97 @@
1440 +# -*- coding:utf-8 -*-
1441 +#
1442 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1443 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1444 +#
1445 +# This program is free software: you can redistribute it and/or modify it under
1446 +# the terms of the GNU Lesser General Public License as published by the Free
1447 +# Software Foundation, either version 2.1 of the License, or (at your option)
1448 +# any later version.
1449 +#
1450 +# This program is distributed in the hope that it will be useful, but WITHOUT
1451 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1452 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1453 +# details.
1454 +#
1455 +# You should have received a copy of the GNU Lesser General Public License along
1456 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1457 +import unittest2
1458 +
1459 +from nerdy import core, tokenizer
1460 +
1461 +
1462 +class PreprocessorTest(unittest2.TestCase):
1463 +    """ Test of preprocessors """
1464 +
1465 +    def test_lowercasefilter(self):
1466 +        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
1467 +        token = tokenizer.Token('toto', 0, 4, None)
1468 +        self.assertEqual(preprocessor(token), None)
1469 +        token = tokenizer.Token('toto Tata', 0, 4, None)
1470 +        self.assertEqual(preprocessor(token), token)
1471 +        token = tokenizer.Token('toto tata', 0, 4, None)
1472 +        self.assertEqual(preprocessor(token), None)
1473 +
1474 +    def test_wordsizefilter(self):
1475 +        preprocessor = core.NerdyWordSizeFilterPreprocessor()
1476 +        token = tokenizer.Token('toto', 0, 4, None)
1477 +        self.assertEqual(preprocessor(token), token)
1478 +        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
1479 +        token = tokenizer.Token('toto', 0, 4, None)
1480 +        self.assertEqual(preprocessor(token), token)
1481 +        token = tokenizer.Token('to', 0, 4, None)
1482 +        self.assertEqual(preprocessor(token), None)
1483 +        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
1484 +        token = tokenizer.Token('toto', 0, 4, None)
1485 +        self.assertEqual(preprocessor(token), None)
1486 +        token = tokenizer.Token('to', 0, 4, None)
1487 +        self.assertEqual(preprocessor(token), token)
1488 +
1489 +    def test_lowerfirstword(self):
1490 +        preprocessor = core.NerdyLowerFirstWordPreprocessor()
1491 +        sentence = tokenizer.Sentence(0, 0, 20)
1492 +        # Start of the sentence
1493 +        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
1494 +        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
1495 +        self.assertEqual(preprocessor(token1), token2)
1496 +        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
1497 +        token2 = tokenizer.Token('us tata', 0, 4, sentence)
1498 +        self.assertEqual(preprocessor(token1), token2)
1499 +        # Not start of the sentence
1500 +        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
1501 +        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
1502 +        self.assertEqual(preprocessor(token1), token2)
1503 +        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
1504 +        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
1505 +        self.assertEqual(preprocessor(token1), token2)
1506 +
1507 +    def test_stopwordsfilter(self):
1508 +        preprocessor = core.NerdyStopwordsFilterPreprocessor()
1509 +        token = tokenizer.Token('Toto', 0, 4, None)
1510 +        self.assertEqual(preprocessor(token), token)
1511 +        token = tokenizer.Token('Us', 0, 4, None)
1512 +        self.assertEqual(preprocessor(token), None)
1513 +        token = tokenizer.Token('Us there', 0, 4, None)
1514 +        self.assertEqual(preprocessor(token), token)
1515 +        # Split words
1516 +        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
1517 +        token = tokenizer.Token('Us there', 0, 4, None)
1518 +        self.assertEqual(preprocessor(token), None)
1519 +        token = tokenizer.Token('Us there toto', 0, 4, None)
1520 +        self.assertEqual(preprocessor(token), token)
1521 +
1522 +    def test_hashtag(self):
1523 +        preprocessor = core.NerdyHashTagPreprocessor()
1524 +        token = tokenizer.Token('Toto', 0, 4, None)
1525 +        self.assertEqual(preprocessor(token), token)
1526 +        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
1527 +        token2 = tokenizer.Token('BarackObama', 0, 4, None)
1528 +        self.assertEqual(preprocessor(token1), token2)
1529 +        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
1530 +        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
1531 +        self.assertEqual(preprocessor(token1), token2)
1532 +
1533 +
1534 +if __name__ == '__main__':
1535 +    unittest2.main()
1536 +
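
A minimal usage sketch of the preprocessor API exercised by the tests above: each preprocessor is a callable that returns the (possibly rewritten) token, or None to drop it. The pipeline below is illustrative only and is not part of this changeset; it assumes only the classes and the RichStringTokenizer signature shown in these tests.

    from nerdy import core, tokenizer

    # Chain of preprocessors; a None result drops the token from further processing.
    preprocessors = [core.NerdyStopwordsFilterPreprocessor(),
                     core.NerdyLowerFirstWordPreprocessor(),
                     core.NerdyHashTagPreprocessor()]

    text = 'Hello @Barack_Obama, this is me speaking.'
    kept = []
    for token in tokenizer.RichStringTokenizer(text, token_min_size=1, token_max_size=3):
        for preprocessor in preprocessors:
            token = preprocessor(token)
            if token is None:
                break
        if token is not None:
            kept.append(token)
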
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
@@ -0,0 +1,88 @@
1537 +# -*- coding:utf-8 -*-
1538 +#
1539 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1540 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1541 +#
1542 +# This program is free software: you can redistribute it and/or modify it under
1543 +# the terms of the GNU Lesser General Public License as published by the Free
1544 +# Software Foundation, either version 2.1 of the License, or (at your option)
1545 +# any later version.
1546 +#
1547 +# This program is distributed in the hope that it will be useful, but WITHOUT
1548 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1549 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1550 +# details.
1551 +#
1552 +# You should have received a copy of the GNU Lesser General Public License along
1553 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1554 +import unittest2
1555 +
1556 +from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
1557 +
1558 +
1559 +class TokenizerTest(unittest2.TestCase):
1560 +    """ Test of tokenizer """
1561 +
1562 +    def test_richstringtokenizer(self):
1563 +        text = 'Hello everyone, this is   me speaking. And me.'
1564 +        tokenizer = RichStringTokenizer(text,
1565 +                                        token_min_size=1,
1566 +                                        token_max_size=3)
1567 +        tokens = list(tokenizer)
1568 +        self.assertEqual(len(tokens), 18)
1569 +        t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
1570 +        self.assertEqual(tokens[0], t1)
1571 +        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
1572 +        self.assertEqual(tokens[16], t2)
1573 +
1574 +    def test_richstringtokenizer_loadtext(self):
1575 +        text = 'Hello everyone, this is   me speaking. And me.'
1576 +        tokenizer = RichStringTokenizer(text,
1577 +                                        token_min_size=1,
1578 +                                        token_max_size=3)
1579 +        tokens = list(tokenizer)
1580 +        self.assertEqual(len(tokens), 18)
1581 +        tokenizer.load_text('Hello everyone')
1582 +        tokens = list(tokenizer)
1583 +        self.assertEqual(len(tokens), 3)
1584 +
1585 +    def test_richstringtokenizer_minsize(self):
1586 +        text = 'Hello everyone, this is   me speaking. And me.'
1587 +        tokenizer = RichStringTokenizer(text,
1588 +                                        token_min_size=2,
1589 +                                        token_max_size=3)
1590 +        tokens = list(tokenizer)
1591 +        self.assertEqual(len(tokens), 10)
1592 +        t1 = Token(word='me speaking', start=26, end=37, sentence=Sentence(indice=0, start=0, end=38))
1593 +        self.assertEqual(tokens[8], t1)
1594 +
1595 +    def test_richstringtokenizer_maxsize(self):
1596 +        text = 'Hello everyone, this is   me speaking. And me.'
1597 +        tokenizer = RichStringTokenizer(text,
1598 +                                        token_min_size=1,
1599 +                                        token_max_size=4)
1600 +        tokens = list(tokenizer)
1601 +        self.assertEqual(len(tokens), 21)
1602 +        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
1603 +        self.assertEqual(tokens[18], t1)
1604 +
1605 +    def test_richstringtokenizer_sentences(self):
1606 +        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
1607 +        tokenizer = RichStringTokenizer(text,
1608 +                                        token_min_size=1,
1609 +                                        token_max_size=4)
1610 +        sentences = tokenizer.find_sentences(text)
1611 +        self.assertEqual(len(sentences), 4)
1612 +        self.assertEqual(text[sentences[0].start:sentences[0].end],
1613 +                         'Hello everyone, this is   me speaking.')
1614 +        self.assertEqual(text[sentences[1].start:sentences[1].end],
1615 +                         ' And me !')
1616 +        self.assertEqual(text[sentences[2].start:sentences[2].end],
1617 +                         'Why not me ?')
1618 +        self.assertEqual(text[sentences[3].start:sentences[3].end],
1619 +                         ' Blup')
1620 +
1621 +
1622 +if __name__ == '__main__':
1623 +    unittest2.main()
1624 +
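
As with the preprocessor tests, a minimal usage sketch of the tokenizer API exercised above (find_sentences, load_text, and iteration over word-window tokens). It assumes Token and Sentence expose their fields as attributes (e.g. token.word, sentence.start), as the equality checks in these tests suggest; the snippet is illustrative and not part of the nerdy package.

    from nerdy.tokenizer import RichStringTokenizer

    text = 'Hello everyone, this is me speaking. And me.'
    tokenizer = RichStringTokenizer(text, token_min_size=1, token_max_size=3)

    # Sentences carry (indice, start, end) offsets into the original text.
    for sentence in tokenizer.find_sentences(text):
        print(text[sentence.start:sentence.end])

    # Iterating the tokenizer yields Token(word, start, end, sentence) windows
    # of 1 to 3 words; the same instance can be reloaded with a new text.
    tokenizer.load_text('Hello everyone')
    print([token.word for token in tokenizer])
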