[rename] Rename modules with shorter names, related to #187461

author: Vincent Michel <vincent.michel@logilab.fr>
changeset: 39ce88665867
branch: default
phase: draft
hidden: yes
parent revision: #6a0b643b9e78 [named entities] Split core into preprocessors and filters modules, related to #187461
child revision: #14a80f3aea13 [data] Move french lemmas in data module, related to #187461
files modified by this revision
__pkginfo__.py
data/__init__.py
data/countries.py
data/countries_iso_3166.txt
data/stopwords.py
data/us_states.py
demo.py
examples/demo.py
named_entities/__init__.py
named_entities/filters.py
named_entities/preprocessors.py
named_entities/sources.py
ner/__init__.py
ner/filters.py
ner/preprocessors.py
ner/sources.py
record_linkage/__init__.py
record_linkage/aligner.py
record_linkage/blocking.py
reference_data/__init__.py
reference_data/countries.py
reference_data/countries_iso_3166.txt
reference_data/stopwords.py
reference_data/us_states.py
rl/__init__.py
rl/aligner.py
rl/blocking.py
test/test_alignment.py
test/test_blocking.py
test/test_dataio.py
test/test_filters.py
test/test_named_entities.py
test/test_ner.py
test/test_preprocessors.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1387464356 0
# Thu Dec 19 14:45:56 2013 +0000
# Node ID 39ce8866586755202a4e07ef8226d808fe1168e8
# Parent 6a0b643b9e78124b989162b20c3327dca0953a1e
[rename] Rename modules with shorter names, related to #187461

diff --git a/__pkginfo__.py b/__pkginfo__.py
@@ -33,9 +33,9 @@
1  author_email = "contact@logilab.fr"
2 
3 
4  from os.path import join
5  scripts = [join('bin', 'pytest')]
6 -include_dirs = [join('test', 'data', 'utils', 'named_entities', 'record_linkage', 'reference_data', 'examples')]
7 +include_dirs = [join('test', 'data', 'utils', 'ner', 'rl', 'examples')]
8 
9  if sys.version_info < (2, 7):
10      install_requires = ['unittest2 >= 0.5.1']
diff --git a/data/__init__.py b/data/__init__.py
diff --git a/data/countries.py b/data/countries.py
@@ -0,0 +1,994 @@
11 +
12 +# Countries list (ISO-3166)
13 +COUNTRIES = {'##': 'non renseign\xc3\xa9',
14 +             '..': 'non renseign\xc3\xa9',
15 +             'aa': 'aire g\xc3\xa9ographique ancienne',
16 +             'ad': 'Andorre',
17 +             'ae': '\xc3\x89mirats arabes unis',
18 +             'af': 'Afghanistan',
19 +             'ag': 'Antigua-et-Barbuda',
20 +             'ai': 'Anguilla',
21 +             'al': 'Albanie',
22 +             'am': 'Arm\xc3\xa9nie',
23 +             'an': 'Antilles n\xc3\xa9erlandaises',
24 +             'ao': 'Angola',
25 +             'aq': 'Antarctique',
26 +             'ar': 'Argentine',
27 +             'as': 'Samoa am\xc3\xa9ricaines',
28 +             'at': 'Autriche',
29 +             'au': 'Australie',
30 +             'aw': 'Aruba',
31 +             'ax': 'Aland (\xc3\xaeles)',
32 +             'az': 'Azerba\xc3\xafdjan',
33 +             'ba': 'Bosnie-Herz\xc3\xa9govine',
34 +             'bb': 'Barbade',
35 +             'bd': 'Bangladesh',
36 +             'be': 'Belgique',
37 +             'bf': 'Burkina',
38 +             'bg': 'Bulgarie',
39 +             'bh': 'Bahre\xc3\xafn',
40 +             'bi': 'Burundi',
41 +             'bj': 'B\xc3\xa9nin',
42 +             'bl': 'Saint-Barth\xc3\xa9lemy',
43 +             'bm': 'Bermudes',
44 +             'bn': 'Brun\xc3\xa9i',
45 +             'bo': 'Bolivie',
46 +             'bq': 'Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache',
47 +             'br': 'Br\xc3\xa9sil',
48 +             'bs': 'Bahamas',
49 +             'bt': 'Bhoutan',
50 +             'bv': 'Bouvet (\xc3\xaele)',
51 +             'bw': 'Botswana',
52 +             'by': 'Bi\xc3\xa9lorussie,B\xc3\xa9larus',
53 +             'bz': 'Belize',
54 +             'ca': 'Canada',
55 +             'cc': 'Cocos (\xc3\xaeles),Keeling (\xc3\xaeles)',
56 +             'cd': 'Congo (R\xc3\xa9publique d\xc3\xa9mocratique),Za\xc3\xafre',
57 +             'cf': 'Centrafrique,R\xc3\xa9publique centrafricaine',
58 +             'cg': 'Congo,Congo (R\xc3\xa9publique)',
59 +             'ch': 'Suisse,Conf\xc3\xa9d\xc3\xa9ration helv\xc3\xa9tique',
60 +             'ci': "C\xc3\xb4te d'Ivoire\n",
61 +             'ck': 'Cook (\xc3\xaeles)',
62 +             'cl': 'Chili',
63 +             'cm': 'Cameroun',
64 +             'cn': 'Chine,Chine (R\xc3\xa9publique populaire)',
65 +             'co': 'Colombie',
66 +             'cr': 'Costa Rica',
67 +             'cs': 'Serbie-et-Mont\xc3\xa9n\xc3\xa9gro',
68 +             'cu': 'Cuba',
69 +             'cv': 'Cap-Vert',
70 +             'cw': 'Cura\xc3\xa7ao',
71 +             'cx': 'Christmas (\xc3\xaele)',
72 +             'cy': 'Chypre',
73 +             'cz': 'R\xc3\xa9publique tch\xc3\xa8que,Tch\xc3\xa8que, R\xc3\xa9publique',
74 +             'dd': 'Allemagne (R\xc3\xa9publique d\xc3\xa9mocratique)',
75 +             'de': 'Allemagne,Allemagne (R\xc3\xa9publique f\xc3\xa9d\xc3\xa9rale)',
76 +             'dj': 'Djibouti',
77 +             'dk': 'Danemark',
78 +             'dm': 'Dominique',
79 +             'do': 'R\xc3\xa9publique dominicaine,Dominicaine, R\xc3\xa9publique',
80 +             'dz': 'Alg\xc3\xa9rie',
81 +             'ec': '\xc3\x89quateur',
82 +             'ee': 'Estonie',
83 +             'eg': '\xc3\x89gypte',
84 +             'eh': 'Sahara occidental',
85 +             'er': '\xc3\x89rythr\xc3\xa9e',
86 +             'es': 'Espagne',
87 +             'et': '\xc3\x89thiopie',
88 +             'fi': 'Finlande',
89 +             'fj': 'Fidji',
90 +             'fk': 'Malouines (\xc3\xaeles),Falkland (\xc3\xaeles)',
91 +             'fm': 'Micron\xc3\xa9sie,\xc3\x89tats f\xc3\xa9d\xc3\xa9r\xc3\xa9s de Micron\xc3\xa9sie',
92 +             'fo': 'F\xc3\xa9ro\xc3\xa9 (\xc3\xaeles)',
93 +             'fr': 'France',
94 +             'ga': 'Gabon',
95 +             'gb': 'Grande-Bretagne,Royaume-Uni',
96 +             'gd': 'Grenade',
97 +             'ge': 'G\xc3\xa9orgie',
98 +             'gf': 'Guyane fran\xc3\xa7aise',
99 +             'gg': 'Guernesey',
100 +             'gh': 'Ghana',
101 +             'gi': 'Gibraltar',
102 +             'gl': 'Groenland',
103 +             'gm': 'Gambie',
104 +             'gn': 'Guin\xc3\xa9e',
105 +             'gp': 'Guadeloupe',
106 +             'gq': 'Guin\xc3\xa9e \xc3\xa9quatoriale',
107 +             'gr': 'Gr\xc3\xa8ce',
108 +             'gs': 'G\xc3\xa9orgie du Sud et les \xc3\xaeles Sandwich du Sud',
109 +             'gt': 'Guatemala',
110 +             'gu': 'Guam',
111 +             'gw': 'Guin\xc3\xa9e-Bissau',
112 +             'gy': 'Guyana',
113 +             'hk': 'Hong Kong',
114 +             'hm': 'Heard (\xc3\xaele) et \xc3\xaeles McDonald',
115 +             'hn': 'Honduras',
116 +             'hr': 'Croatie',
117 +             'ht': 'Ha\xc3\xafti',
118 +             'hu': 'Hongrie',
119 +             'id': 'Indon\xc3\xa9sie',
120 +             'ie': 'Irlande',
121 +             'ii': 'intergouvernemental',
122 +             'il': 'Isra\xc3\xabl',
123 +             'im': '\xc3\x8ele de Man,Man, \xc3\x8ele de',
124 +             'in': 'Inde',
125 +             'io': "Territoire britannique de l'Oc\xc3\xa9an indien,Chagos (\xc3\xaeles)###Oc\xc3\xa9an indien, Territoire britannique de l'\n",
126 +             'iq': 'Irak',
127 +             'ir': 'Iran',
128 +             'is': 'Islande',
129 +             'it': 'Italie',
130 +             'je': 'Jersey',
131 +             'jm': 'Jama\xc3\xafque',
132 +             'jo': 'Jordanie',
133 +             'jp': 'Japon',
134 +             'ke': 'Kenya',
135 +             'kg': 'Kirghizistan',
136 +             'kh': 'Cambodge',
137 +             'ki': 'Kiribati',
138 +             'km': 'Comores',
139 +             'kn': 'Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis',
140 +             'ko': 'Kosovo',
141 +             'kp': 'Cor\xc3\xa9e (R\xc3\xa9publique populaire d\xc3\xa9mocratique),Cor\xc3\xa9e du Nord',
142 +             'kr': 'Cor\xc3\xa9e (R\xc3\xa9publique),Cor\xc3\xa9e du Sud',
143 +             'kw': 'Kowe\xc3\xaft',
144 +             'ky': 'Cayman,Ca\xc3\xafmanes, \xc3\x8eles###Ca\xc3\xafman (\xc3\xaeles)',
145 +             'kz': 'Kazakhstan',
146 +             'la': 'Laos',
147 +             'lb': 'Liban',
148 +             'lc': 'Sainte-Lucie',
149 +             'li': 'Liechtenstein',
150 +             'lk': 'Sri Lanka',
151 +             'lr': 'Liberia',
152 +             'ls': 'Lesotho',
153 +             'lt': 'Lituanie',
154 +             'lu': 'Luxembourg',
155 +             'lv': 'Lettonie',
156 +             'ly': 'Libye',
157 +             'ma': 'Maroc',
158 +             'mc': 'Monaco',
159 +             'md': 'Moldavie,Moldova, R\xc3\xa9publique de',
160 +             'me': 'Mont\xc3\xa9n\xc3\xa9gro',
161 +             'mf': 'Saint-Martin (partie fran\xc3\xa7aise)',
162 +             'mg': 'Madagascar',
163 +             'mh': 'Marshall (\xc3\xaeles)',
164 +             'mk': 'Mac\xc3\xa9doine (R\xc3\xa9publique)',
165 +             'ml': 'Mali',
166 +             'mm': 'Myanmar,Birmanie',
167 +             'mn': 'Mongolie',
168 +             'mo': 'Macao',
169 +             'mp': 'Mariannes du Nord (\xc3\xaeles)',
170 +             'mq': 'Martinique',
171 +             'mr': 'Mauritanie',
172 +             'ms': 'Montserrat',
173 +             'mt': 'Malte',
174 +             'mu': 'Maurice',
175 +             'mv': 'Maldives',
176 +             'mw': 'Malawi',
177 +             'mx': 'Mexique',
178 +             'my': 'Malaisie',
179 +             'mz': 'Mozambique',
180 +             'na': 'Namibie',
181 +             'nc': 'Nouvelle-Cal\xc3\xa9donie',
182 +             'ne': 'Niger',
183 +             'nf': 'Norfolk (\xc3\xaele)',
184 +             'ng': 'Nigeria',
185 +             'ni': 'Nicaragua',
186 +             'nl': 'Pays-Bas',
187 +             'no': 'Norv\xc3\xa8ge',
188 +             'np': 'N\xc3\xa9pal',
189 +             'nr': 'Nauru',
190 +             'nu': 'Niue',
191 +             'nz': 'Nouvelle-Z\xc3\xa9lande',
192 +             'om': 'Oman',
193 +             'oo': 'code non adapt\xc3\xa9',
194 +             'pa': 'Panama',
195 +             'pe': 'P\xc3\xa9rou',
196 +             'pf': 'Polyn\xc3\xa9sie fran\xc3\xa7aise',
197 +             'pg': 'Papouasie-Nouvelle-Guin\xc3\xa9e',
198 +             'ph': 'Philippines',
199 +             'pk': 'Pakistan',
200 +             'pl': 'Pologne',
201 +             'pm': 'Saint-Pierre-et-Miquelon',
202 +             'pn': 'Pitcairn',
203 +             'pr': 'Porto Rico',
204 +             'ps': 'Autorit\xc3\xa9 palestinienne,Palestine',
205 +             'pt': 'Portugal',
206 +             'pw': 'Palau,Palaos',
207 +             'py': 'Paraguay',
208 +             'qa': 'Qatar',
209 +             're': 'R\xc3\xa9union',
210 +             'ro': 'Roumanie',
211 +             'rs': 'Serbie',
212 +             'ru': 'Russie (F\xc3\xa9d\xc3\xa9ration),Russie',
213 +             'rw': 'Rwanda',
214 +             'sa': 'Arabie saoudite',
215 +             'sb': 'Salomon (\xc3\xaeles)',
216 +             'sc': 'Seychelles',
217 +             'sd': 'Soudan',
218 +             'se': 'Su\xc3\xa8de',
219 +             'sg': 'Singapour',
220 +             'sh': 'Sainte-H\xc3\xa9l\xc3\xa8ne,Ascension (\xc3\xaele)###Tristan da Cunha (\xc3\xaele)',
221 +             'si': 'Slov\xc3\xa9nie',
222 +             'sj': 'Svalbard et \xc3\xaele Jan Mayen',
223 +             'sk': 'Slovaquie',
224 +             'sl': 'Sierra Leone',
225 +             'sm': 'Saint-Marin',
226 +             'sn': 'S\xc3\xa9n\xc3\xa9gal',
227 +             'so': 'Somalie',
228 +             'sr': 'Suriname',
229 +             'ss': 'Soudan du Sud,Sud Soudan',
230 +             'st': 'Sao Tom\xc3\xa9-et-Principe',
231 +             'su': 'URSS',
232 +             'sv': 'El Salvador,Salvador',
233 +             'sx': 'Saint-Martin (partie n\xc3\xa9erlandaise),Sint Maarten',
234 +             'sy': 'Syrie',
235 +             'sz': 'Swaziland',
236 +             'tc': 'Turks et Ca\xc3\xafques (\xc3\xaeles)',
237 +             'td': 'Tchad',
238 +             'tf': 'Terres australes fran\xc3\xa7aises',
239 +             'tg': 'Togo',
240 +             'th': 'Tha\xc3\xaflande',
241 +             'tj': 'Tadjikistan',
242 +             'tk': 'Tokelau',
243 +             'tl': 'Timor oriental',
244 +             'tm': 'Turkm\xc3\xa9nistan',
245 +             'tn': 'Tunisie',
246 +             'to': 'Tonga',
247 +             'tr': 'Turquie',
248 +             'tt': 'Trinit\xc3\xa9-et-Tobago',
249 +             'tv': 'Tuvalu',
250 +             'tw': 'Ta\xc3\xafwan,Chine (R\xc3\xa9publique)',
251 +             'tz': 'Tanzanie',
252 +             'ua': 'Ukraine',
253 +             'ug': 'Ouganda',
254 +             'um': '\xc3\x8eles mineures \xc3\xa9loign\xc3\xa9es des \xc3\x89tats-Unis',
255 +             'us': '\xc3\x89tats-Unis',
256 +             'uy': 'Uruguay',
257 +             'uz': 'Ouzb\xc3\xa9kistan',
258 +             'va': 'Vatican,Saint-Si\xc3\xa8ge',
259 +             'vc': 'Saint-Vincent-et-les Grenadines',
260 +             've': 'Venezuela',
261 +             'vg': '\xc3\x8eles Vierges britanniques,Vierges (\xc3\xaeles) britanniques',
262 +             'vi': '\xc3\x8eles Vierges am\xc3\xa9ricaines,Vierges (\xc3\xaeles) am\xc3\xa9ricaines',
263 +             'vn': 'Viet Nam',
264 +             'vu': 'Vanuatu',
265 +             'wf': 'Wallis et Futuna (\xc3\xaeles)',
266 +             'ws': 'Samoa,Samoa occidentales',
267 +             'xc': 'Tch\xc3\xa9coslovaquie',
268 +             'xd': 'Allemagne avant 1945',
269 +             'xe': 'Europe,Union europ\xc3\xa9enne',
270 +             'xk': 'Cor\xc3\xa9e avant 1948',
271 +             'xn': 'Pays-Bas avant 1830,Belgique avant 1830',
272 +             'xx': 'inconnu',
273 +             'yd': 'Y\xc3\xa9men (R\xc3\xa9publique d\xc3\xa9mocratique populaire),Sud Y\xc3\xa9men',
274 +             'ye': 'Y\xc3\xa9men',
275 +             'yt': 'Mayotte',
276 +             'yu': 'Yougoslavie',
277 +             'yy': "ne s'applique pas\n",
278 +             'za': 'Afrique du Sud',
279 +             'zm': 'Zambie',
280 +             'zw': 'Zimbabwe',
281 +             'zz': 'multiple\n'}
282 +
283 +
284 +# REGIONS TO COUNTRIES MAPPING
285 +REGIONS_TO_COUNTRIES = {u'Abruzzes': u'Italie',
286 +                        u'Acha\xefe': u'Gr\xe8ce',
287 +                        u'Acre': u'Br\xe9sil',
288 +                        u'Afghanistan': u'Afghanistan',
289 +                        u'Afrique du Sud': u'Afrique du Sud',
290 +                        u'Aguascalientes': u'Mexique',
291 +                        u'Ain': u'France',
292 +                        u'Aisne': u'France',
293 +                        u'Alabama': u'\xc9tats-Unis',
294 +                        u'Alagoas': u'Br\xe9sil',
295 +                        u'Aland (\xeeles)': u'Aland (\xeeles)',
296 +                        u'Alaska': u'\xc9tats-Unis',
297 +                        u'Albanie': u'Albanie',
298 +                        u'Alberta': u'Canada',
299 +                        u'Alg\xe9rie': u'Alg\xe9rie',
300 +                        u'Allemagne': u'Allemagne',
301 +                        u'Allemagne (R\xe9publique d\xe9mocratique)': u'Allemagne (R\xe9publique d\xe9mocratique)',
302 +                        u'Allemagne avant 1945': u'Allemagne avant 1945',
303 +                        u'Allier': u'France',
304 +                        u'Alpes-Maritimes': u'France',
305 +                        u'Alpes-de-Haute-Provence': u'France',
306 +                        u'Alsace': u'France',
307 +                        u'Amapa': u'Br\xe9sil',
308 +                        u'Amazonas': u'Br\xe9sil',
309 +                        u'Andalousie': u'Espagne',
310 +                        u'Andorre': u'Andorre',
311 +                        u'Angola': u'Angola',
312 +                        u'Anguilla': u'Anguilla',
313 +                        u'Antarctique': u'Antarctique',
314 +                        u'Antigua-et-Barbuda': u'Antigua-et-Barbuda',
315 +                        u'Antilles n\xe9erlandaises': u'Antilles n\xe9erlandaises',
316 +                        u'Anvers': u'Belgique',
317 +                        u'Appenzell-Rhodes-Ext\xe9rieures': u'Suisse',
318 +                        u'Appenzell-Rhodes-Int\xe9rieures': u'Suisse',
319 +                        u'Aquitaine': u'France',
320 +                        u'Arabie saoudite': u'Arabie saoudite',
321 +                        u'Aragon': u'Espagne',
322 +                        u'Arcadie': u'Gr\xe8ce',
323 +                        u'Ardennes': u'France',
324 +                        u'Ard\xe8che': u'France',
325 +                        u'Argentine': u'Argentine',
326 +                        u'Argolide': u'Gr\xe8ce',
327 +                        u'Argovie': u'Suisse',
328 +                        u'Arizona': u'\xc9tats-Unis',
329 +                        u'Ari\xe8ge': u'France',
330 +                        u'Arkansas': u'\xc9tats-Unis',
331 +                        u'Arm\xe9nie': u'Arm\xe9nie',
332 +                        u'Aruba': u'Aruba',
333 +                        u'Asturies': u'Espagne',
334 +                        u'Ath\xe8nes et agglom\xe9ration': u'Gr\xe8ce',
335 +                        u'Attique': u'Gr\xe8ce',
336 +                        u'Aube': u'France',
337 +                        u'Aude': u'France',
338 +                        u'Australie': u'Australie',
339 +                        u'Australie-M\xe9ridionale': u'Australie',
340 +                        u'Australie-Occidentale': u'Australie',
341 +                        u'Autorit\xe9 palestinienne': u'Autorit\xe9 palestinienne',
342 +                        u'Autriche': u'Autriche',
343 +                        u'Auvergne': u'France',
344 +                        u'Aveyron': u'France',
345 +                        u'Azerba\xefdjan': u'Azerba\xefdjan',
346 +                        u'Bade-Wurtemberg': u'Allemagne',
347 +                        u'Bahamas': u'Bahamas',
348 +                        u'Bahia': u'Br\xe9sil',
349 +                        u'Bahre\xefn': u'Bahre\xefn',
350 +                        u'Baja California Norte': u'Mexique',
351 +                        u'Baja California Sur': u'Mexique',
352 +                        u'Bangladesh': u'Bangladesh',
353 +                        u'Barbade': u'Barbade',
354 +                        u'Bas-Rhin': u'France',
355 +                        u'Basilicate': u'Italie',
356 +                        u'Basse-Autriche': u'Autriche',
357 +                        u'Basse-Normandie': u'France',
358 +                        u'Basse-Saxe': u'Allemagne',
359 +                        u'Bavi\xe8re': u'Allemagne',
360 +                        u'Belgique': u'Belgique',
361 +                        u'Belize': u'Belize',
362 +                        u'Berlin': u'Allemagne',
363 +                        u'Bermudes': u'Bermudes',
364 +                        u'Berne': u'Suisse',
365 +                        u'Bhoutan': u'Bhoutan',
366 +                        u'Bi\xe9lorussie': u'Bi\xe9lorussie',
367 +                        u'Bolivie': u'Bolivie',
368 +                        u'Bonaire, Saint-Eustache et Saba': u'Bonaire, Saint-Eustache et Saba',
369 +                        u'Bosnie-Herz\xe9govine': u'Bosnie-Herz\xe9govine',
370 +                        u'Botswana': u'Botswana',
371 +                        u'Bouches-du-Rh\xf4ne': u'France',
372 +                        u'Bourgogne': u'France',
373 +                        u'Bouvet (\xeele)': u'Bouvet (\xeele)',
374 +                        u'Brabant': u'Belgique',
375 +                        u'Brabant flamand': u'Belgique',
376 +                        u'Brabant wallon': u'Belgique',
377 +                        u'Brabant-Septentrional': u'Pays-Bas',
378 +                        u'Brandebourg': u'Allemagne',
379 +                        u'Bretagne': u'France',
380 +                        u'Brun\xe9i': u'Brun\xe9i',
381 +                        u'Bruxelles': u'Belgique',
382 +                        u'Br\xe9sil': u'Br\xe9sil',
383 +                        u'Br\xeame': u'Allemagne',
384 +                        u'Buenos Aires': u'Argentine',
385 +                        u'Bulgarie': u'Bulgarie',
386 +                        u'Burgenland': u'Autriche',
387 +                        u'Burkina': u'Burkina',
388 +                        u'Burundi': u'Burundi',
389 +                        u'B\xe2le-Campagne': u'Suisse',
390 +                        u'B\xe2le-Ville': u'Suisse',
391 +                        u'B\xe9nin': u'B\xe9nin',
392 +                        u'B\xe9otie': u'Gr\xe8ce',
393 +                        u'Calabre': u'Italie',
394 +                        u'Californie': u'\xc9tats-Unis',
395 +                        u'Calvados': u'France',
396 +                        u'Cambodge': u'Cambodge',
397 +                        u'Cameroun': u'Cameroun',
398 +                        u'Campanie': u'Italie',
399 +                        u'Campeche': u'Mexique',
400 +                        u'Canada': u'Canada',
401 +                        u'Canaries': u'Espagne',
402 +                        u'Cantabrie': u'Espagne',
403 +                        u'Cantal': u'France',
404 +                        u'Cap-Vert': u'Cap-Vert',
405 +                        u'Capitale f\xe9d\xe9rale': u'Argentine',
406 +                        u'Carinthie': u'Autriche',
407 +                        u'Caroline du Nord': u'\xc9tats-Unis',
408 +                        u'Caroline du Sud': u'\xc9tats-Unis',
409 +                        u'Castille et L\xe9on': u'Espagne',
410 +                        u'Castille-la Manche': u'Espagne',
411 +                        u'Catalogne': u'Espagne',
412 +                        u'Catamarca': u'Argentine',
413 +                        u'Cayman': u'Cayman',
414 +                        u'Cear\xe1': u'Br\xe9sil',
415 +                        u'Centrafrique': u'Centrafrique',
416 +                        u'Centre': u'France',
417 +                        u'Ceuta': u'Espagne',
418 +                        u'Chaco': u'Argentine',
419 +                        u'Chalcidique': u'Gr\xe8ce',
420 +                        u'Champagne-Ardenne': u'France',
421 +                        u'Charente': u'France',
422 +                        u'Charente-Maritime': u'France',
423 +                        u'Cher': u'France',
424 +                        u'Chiapas': u'Mexique',
425 +                        u'Chihuahua': u'Mexique',
426 +                        u'Chili': u'Chili',
427 +                        u'Chine': u'Chine',
428 +                        u'Christmas (\xeele)': u'Christmas (\xeele)',
429 +                        u'Chubut': u'Argentine',
430 +                        u'Chypre': u'Chypre',
431 +                        u'Ch\xedos': u'Gr\xe8ce',
432 +                        u'Coahuila': u'Mexique',
433 +                        u'Cocos (\xeeles)': u'Cocos (\xeeles)',
434 +                        u'Colima': u'Mexique',
435 +                        u'Colombie': u'Colombie',
436 +                        u'Colombie britannique': u'Canada',
437 +                        u'Colorado': u'\xc9tats-Unis',
438 +                        u'Communaut\xe9 de Madrid': u'Espagne',
439 +                        u'Communaut\xe9 de Valence': u'Espagne',
440 +                        u'Comores': u'Comores',
441 +                        u'Congo': u'Congo',
442 +                        u'Congo (R\xe9publique d\xe9mocratique)': u'Congo (R\xe9publique d\xe9mocratique)',
443 +                        u'Connecticut': u'\xc9tats-Unis',
444 +                        u'Cook (\xeeles)': u'Cook (\xeeles)',
445 +                        u'Corfou': u'Gr\xe8ce',
446 +                        u'Corinthie': u'Gr\xe8ce',
447 +                        u'Corrientes': u'Argentine',
448 +                        u'Corr\xe8ze': u'France',
449 +                        u'Corse': u'France',
450 +                        u'Corse-du-Sud': u'France',
451 +                        u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)': u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)',
452 +                        u'Cor\xe9e (R\xe9publique)': u'Cor\xe9e (R\xe9publique)',
453 +                        u'Cor\xe9e avant 1948': u'Cor\xe9e avant 1948',
454 +                        u'Costa Rica': u'Costa Rica',
455 +                        u'Creuse': u'France',
456 +                        u'Croatie': u'Croatie',
457 +                        u'Cr\xe8te': u'Gr\xe8ce',
458 +                        u'Cuba': u'Cuba',
459 +                        u'Cura\xe7ao': u'Cura\xe7ao',
460 +                        u'Cyclades': u'Gr\xe8ce',
461 +                        u'C\xe9phalonie': u'Gr\xe8ce',
462 +                        u'C\xf3rdoba': u'Argentine',
463 +                        u"C\xf4te d'Ivoire": u"C\xf4te d'Ivoire",
464 +                        u"C\xf4te-d'Or": u'France',
465 +                        u"C\xf4tes-d'Armor": u'France',
466 +                        u'Dakota du Nord': u'\xc9tats-Unis',
467 +                        u'Dakota du Sud': u'\xc9tats-Unis',
468 +                        u'Danemark': u'Danemark',
469 +                        u'Delaware': u'\xc9tats-Unis',
470 +                        u'Deux-S\xe8vres': u'France',
471 +                        u'District de Columbia': u'\xc9tats-Unis',
472 +                        u'District f\xe9d\xe9ral': u'Br\xe9sil',
473 +                        u'Djibouti': u'Djibouti',
474 +                        u'Dod\xe9can\xe8se': u'Gr\xe8ce',
475 +                        u'Dominique': u'Dominique',
476 +                        u'Dordogne': u'France',
477 +                        u'Doubs': u'France',
478 +                        u'Drenthe': u'Pays-Bas',
479 +                        u'Dr\xe1ma': u'Gr\xe8ce',
480 +                        u'Dr\xf4me': u'France',
481 +                        u'Durango': u'Mexique',
482 +                        u'D\xe9pendance de Ross (Nouvelle-Z\xe9lande)': u'Antarctique',
483 +                        u'El Salvador': u'El Salvador',
484 +                        u'Entre-Rios': u'Argentine',
485 +                        u'Espagne': u'Espagne',
486 +                        u'Espirito Santo': u'Br\xe9sil',
487 +                        u'Essonne': u'France',
488 +                        u'Estonie': u'Estonie',
489 +                        u'Estr\xe9madure': u'Espagne',
490 +                        u'Eub\xe9e': u'Gr\xe8ce',
491 +                        u'Eure': u'France',
492 +                        u'Eure-et-Loir': u'France',
493 +                        u'Eurytanie': u'Gr\xe8ce',
494 +                        u'Fidji': u'Fidji',
495 +                        u'Finist\xe8re': u'France',
496 +                        u'Finlande': u'Finlande',
497 +                        u'Flandre occidentale': u'Belgique',
498 +                        u'Flandre orientale': u'Belgique',
499 +                        u'Floride': u'\xc9tats-Unis',
500 +                        u'Fl\xf3rina': u'Gr\xe8ce',
501 +                        u'Formosa': u'Argentine',
502 +                        u'France': u'France',
503 +                        u'Franche-Comt\xe9': u'France',
504 +                        u'Fribourg': u'Suisse',
505 +                        u'Frioul-V\xe9n\xe9tie-Julienne': u'Italie',
506 +                        u'Frise': u'Pays-Bas',
507 +                        u'F\xe9ro\xe9 (\xeeles)': u'F\xe9ro\xe9 (\xeeles)',
508 +                        u'Gabon': u'Gabon',
509 +                        u'Galice': u'Espagne',
510 +                        u'Gambie': u'Gambie',
511 +                        u'Gard': u'France',
512 +                        u'Gen\xe8ve': u'Suisse',
513 +                        u'Gers': u'France',
514 +                        u'Ghana': u'Ghana',
515 +                        u'Gibraltar': u'Gibraltar',
516 +                        u'Gironde': u'France',
517 +                        u'Glaris': u'Suisse',
518 +                        u'Goi\xe1s': u'Br\xe9sil',
519 +                        u'Grande-Bretagne': u'Grande-Bretagne',
520 +                        u'Grenade': u'Grenade',
521 +                        u'Greven\xe1': u'Gr\xe8ce',
522 +                        u'Grisons': u'Suisse',
523 +                        u'Groenland': u'Groenland',
524 +                        u'Groningue': u'Pays-Bas',
525 +                        u'Gr\xe8ce': u'Gr\xe8ce',
526 +                        u'Gr\xe8ce centrale': u'Gr\xe8ce',
527 +                        u'Gr\xe8ce occidentale': u'Gr\xe8ce',
528 +                        u'Guadeloupe': u'Guadeloupe',
529 +                        u'Guam': u'Guam',
530 +                        u'Guanajuato': u'Mexique',
531 +                        u'Guatemala': u'Guatemala',
532 +                        u'Gueldre': u'Pays-Bas',
533 +                        u'Guernesey': u'Guernesey',
534 +                        u'Guerrero': u'Mexique',
535 +                        u'Guin\xe9e': u'Guin\xe9e',
536 +                        u'Guin\xe9e \xe9quatoriale': u'Guin\xe9e \xe9quatoriale',
537 +                        u'Guin\xe9e-Bissau': u'Guin\xe9e-Bissau',
538 +                        u'Guyana': u'Guyana',
539 +                        u'Guyane fran\xe7aise': u'Guyane fran\xe7aise',
540 +                        u'G\xe9orgie': u'\xc9tats-Unis',
541 +                        u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud': u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud',
542 +                        u'Hainaut': u'Belgique',
543 +                        u'Hambourg': u'Allemagne',
544 +                        u'Haut-Rhin': u'France',
545 +                        u'Haute-Autriche': u'Autriche',
546 +                        u'Haute-Corse': u'France',
547 +                        u'Haute-Garonne': u'France',
548 +                        u'Haute-Loire': u'France',
549 +                        u'Haute-Marne': u'France',
550 +                        u'Haute-Normandie': u'France',
551 +                        u'Haute-Savoie': u'France',
552 +                        u'Haute-Sa\xf4ne': u'France',
553 +                        u'Haute-Vienne': u'France',
554 +                        u'Hautes-Alpes': u'France',
555 +                        u'Hautes-Pyr\xe9n\xe9es': u'France',
556 +                        u'Hauts-de-Seine': u'France',
557 +                        u'Hawaii': u'\xc9tats-Unis',
558 +                        u'Ha\xefti': u'Ha\xefti',
559 +                        u'Heard (\xeele) et \xeeles McDonald': u'Heard (\xeele) et \xeeles McDonald',
560 +                        u'Hesse': u'Allemagne',
561 +                        u'Hidalgo': u'Mexique',
562 +                        u'Hollande-M\xe9ridionale': u'Pays-Bas',
563 +                        u'Hollande-Septentrionale': u'Pays-Bas',
564 +                        u'Honduras': u'Honduras',
565 +                        u'Hong Kong': u'Hong Kong',
566 +                        u'Hongrie': u'Hongrie',
567 +                        u'H\xe9rault': u'France',
568 +                        u'Idaho': u'\xc9tats-Unis',
569 +                        u'Ille-et-Vilaine': u'France',
570 +                        u'Illinois': u'\xc9tats-Unis',
571 +                        u'Inde': u'Inde',
572 +                        u'Indiana': u'\xc9tats-Unis',
573 +                        u'Indon\xe9sie': u'Indon\xe9sie',
574 +                        u'Indre': u'France',
575 +                        u'Indre-et-Loire': u'France',
576 +                        u'Iowa': u'\xc9tats-Unis',
577 +                        u'Io\xe1nnina': u'Gr\xe8ce',
578 +                        u'Irak': u'Irak',
579 +                        u'Iran': u'Iran',
580 +                        u'Irlande': u'Irlande',
581 +                        u'Ir\xe1kleion': u'Gr\xe8ce',
582 +                        u'Islande': u'Islande',
583 +                        u'Isra\xebl': u'Isra\xebl',
584 +                        u'Is\xe8re': u'France',
585 +                        u'Italie': u'Italie',
586 +                        u'Jalisco': u'Mexique',
587 +                        u'Jama\xefque': u'Jama\xefque',
588 +                        u'Japon': u'Japon',
589 +                        u'Jersey': u'Jersey',
590 +                        u'Jordanie': u'Jordanie',
591 +                        u'Jujuy': u'Argentine',
592 +                        u'Jura': u'France',
593 +                        u'Kansas': u'\xc9tats-Unis',
594 +                        u'Kard\xedtsa': u'Gr\xe8ce',
595 +                        u'Kastori\xe1': u'Gr\xe8ce',
596 +                        u'Kav\xe1la': u'Gr\xe8ce',
597 +                        u'Kazakhstan': u'Kazakhstan',
598 +                        u'Kentucky': u'\xc9tats-Unis',
599 +                        u'Kenya': u'Kenya',
600 +                        u'Kilk\xeds': u'Gr\xe8ce',
601 +                        u'Kirghizistan': u'Kirghizistan',
602 +                        u'Kiribati': u'Kiribati',
603 +                        u'Kosovo': u'Kosovo',
604 +                        u'Kowe\xeft': u'Kowe\xeft',
605 +                        u'Koz\xe1ni': u'Gr\xe8ce',
606 +                        u'La Can\xe9e': u'Gr\xe8ce',
607 +                        u'Laconie': u'Gr\xe8ce',
608 +                        u'Landes': u'France',
609 +                        u'Languedoc-Roussillon': u'France',
610 +                        u'Laos': u'Laos',
611 +                        u'Las\xedthi': u'Gr\xe8ce',
612 +                        u'Latium': u'Italie',
613 +                        u'Le Pir\xe9e': u'Gr\xe8ce',
614 +                        u'Lesotho': u'Lesotho',
615 +                        u'Lettonie': u'Lettonie',
616 +                        u'Leucade': u'Gr\xe8ce',
617 +                        u'Liban': u'Liban',
618 +                        u'Liberia': u'Liberia',
619 +                        u'Libye': u'Libye',
620 +                        u'Liechtenstein': u'Liechtenstein',
621 +                        u'Ligurie': u'Italie',
622 +                        u'Limbourg': u'Pays-Bas',
623 +                        u'Limousin': u'France',
624 +                        u'Lituanie': u'Lituanie',
625 +                        u'Li\xe8ge': u'Belgique',
626 +                        u'Loir-et-Cher': u'France',
627 +                        u'Loire': u'France',
628 +                        u'Loire-Atlantique': u'France',
629 +                        u'Loiret': u'France',
630 +                        u'Lombardie': u'Italie',
631 +                        u'Lorraine': u'France',
632 +                        u'Lot': u'France',
633 +                        u'Lot-et-Garonne': u'France',
634 +                        u'Louisiane': u'\xc9tats-Unis',
635 +                        u'Loz\xe8re': u'France',
636 +                        u'Lucerne': u'Suisse',
637 +                        u'Luxembourg': u'Belgique',
638 +                        u'L\xe1risa': u'Gr\xe8ce',
639 +                        u'L\xe9svos': u'Gr\xe8ce',
640 +                        u'Macao': u'Macao',
641 +                        u'Mac\xe9doine (R\xe9publique)': u'Mac\xe9doine (R\xe9publique)',
642 +                        u'Mac\xe9doine centrale': u'Gr\xe8ce',
643 +                        u'Mac\xe9doine occidentale': u'Gr\xe8ce',
644 +                        u'Mac\xe9doine orientale et Thrace': u'Gr\xe8ce',
645 +                        u'Madagascar': u'Madagascar',
646 +                        u'Magn\xe9sie': u'Gr\xe8ce',
647 +                        u'Maine': u'\xc9tats-Unis',
648 +                        u'Maine-et-Loire': u'France',
649 +                        u'Malaisie': u'Malaisie',
650 +                        u'Malawi': u'Malawi',
651 +                        u'Maldives': u'Maldives',
652 +                        u'Mali': u'Mali',
653 +                        u'Malouines (\xeeles)': u'Malouines (\xeeles)',
654 +                        u'Malte': u'Malte',
655 +                        u'Manche': u'France',
656 +                        u'Manitoba': u'Canada',
657 +                        u'Maranh\xe3o': u'Br\xe9sil',
658 +                        u'Marches': u'Italie',
659 +                        u'Mariannes du Nord (\xeeles)': u'Mariannes du Nord (\xeeles)',
660 +                        u'Marne': u'France',
661 +                        u'Maroc': u'Maroc',
662 +                        u'Marshall (\xeeles)': u'Marshall (\xeeles)',
663 +                        u'Martinique': u'Martinique',
664 +                        u'Maryland': u'\xc9tats-Unis',
665 +                        u'Massachusetts': u'\xc9tats-Unis',
666 +                        u'Mato grosso': u'Br\xe9sil',
667 +                        u'Mato grosso do Sul': u'Br\xe9sil',
668 +                        u'Maurice': u'Maurice',
669 +                        u'Mauritanie': u'Mauritanie',
670 +                        u'Mayenne': u'France',
671 +                        u'Mayotte': u'Mayotte',
672 +                        u'Mecklembourg-Pom\xe9ranie ant\xe9rieure': u'Allemagne',
673 +                        u'Melilla': u'Espagne',
674 +                        u'Mendoza': u'Argentine',
675 +                        u'Mess\xe9nie': u'Gr\xe8ce',
676 +                        u'Meurthe-et-Moselle': u'France',
677 +                        u'Meuse': u'France',
678 +                        u'Mexico': u'Mexique',
679 +                        u'Mexique': u'Mexique',
680 +                        u'Michigan': u'\xc9tats-Unis',
681 +                        u'Michoac\xe1n': u'Mexique',
682 +                        u'Micron\xe9sie': u'Micron\xe9sie',
683 +                        u'Midi-Pyr\xe9n\xe9es': u'France',
684 +                        u'Minas Gerais': u'Br\xe9sil',
685 +                        u'Minnesota': u'\xc9tats-Unis',
686 +                        u'Misiones': u'Argentine',
687 +                        u'Mississippi': u'\xc9tats-Unis',
688 +                        u'Missouri': u'\xc9tats-Unis',
689 +                        u'Moldavie': u'Moldavie',
690 +                        u'Molise': u'Italie',
691 +                        u'Monaco': u'Monaco',
692 +                        u'Mongolie': u'Mongolie',
693 +                        u'Montana': u'\xc9tats-Unis',
694 +                        u'Montserrat': u'Montserrat',
695 +                        u'Mont\xe9n\xe9gro': u'Mont\xe9n\xe9gro',
696 +                        u'Morbihan': u'France',
697 +                        u'Morelos': u'Mexique',
698 +                        u'Moselle': u'France',
699 +                        u'Mozambique': u'Mozambique',
700 +                        u'Murcie': u'Espagne',
701 +                        u'Myanmar': u'Myanmar',
702 +                        u'Namibie': u'Namibie',
703 +                        u'Namur': u'Belgique',
704 +                        u'Nauru': u'Nauru',
705 +                        u'Navarre': u'Espagne',
706 +                        u'Nayarit': u'Mexique',
707 +                        u'Nebraska': u'\xc9tats-Unis',
708 +                        u'Neuch\xe2tel': u'Suisse',
709 +                        u'Neuqu\xe9n': u'Argentine',
710 +                        u'Nevada': u'\xc9tats-Unis',
711 +                        u'New Hampshire': u'\xc9tats-Unis',
712 +                        u'New Jersey': u'\xc9tats-Unis',
713 +                        u'New York': u'\xc9tats-Unis',
714 +                        u'Nicaragua': u'Nicaragua',
715 +                        u'Nidwald': u'Suisse',
716 +                        u'Niger': u'Niger',
717 +                        u'Nigeria': u'Nigeria',
718 +                        u'Niue': u'Niue',
719 +                        u'Ni\xe8vre': u'France',
720 +                        u'Nord': u'France',
721 +                        u'Nord-Pas-de-Calais': u'France',
722 +                        u'Norfolk (\xeele)': u'Norfolk (\xeele)',
723 +                        u'Norv\xe8ge': u'Norv\xe8ge',
724 +                        u'Nouveau Mexique': u'\xc9tats-Unis',
725 +                        u'Nouveau-Brunswick': u'Canada',
726 +                        u'Nouvelle-Cal\xe9donie': u'Nouvelle-Cal\xe9donie',
727 +                        u'Nouvelle-Galles-du-Sud': u'Australie',
728 +                        u'Nouvelle-Z\xe9lande': u'Nouvelle-Z\xe9lande',
729 +                        u'Nouvelle-\xc9cosse': u'Canada',
730 +                        u'Nuevo Le\xf3n': u'Mexique',
731 +                        u'N\xe9pal': u'N\xe9pal',
732 +                        u'Oaxaca': u'Mexique',
733 +                        u'Obwald': u'Suisse',
734 +                        u'Ohio': u'\xc9tats-Unis',
735 +                        u'Oise': u'France',
736 +                        u'Oklahoma': u'\xc9tats-Unis',
737 +                        u'Oman': u'Oman',
738 +                        u'Ombrie': u'Italie',
739 +                        u'Ontario': u'Canada',
740 +                        u'Oregon': u'\xc9tats-Unis',
741 +                        u'Orne': u'France',
742 +                        u'Ouganda': u'Ouganda',
743 +                        u'Ouzb\xe9kistan': u'Ouzb\xe9kistan',
744 +                        u'Overijssel': u'Pays-Bas',
745 +                        u'Pakistan': u'Pakistan',
746 +                        u'Palau': u'Palau',
747 +                        u'Pampa': u'Argentine',
748 +                        u'Panama': u'Panama',
749 +                        u'Papouasie-Nouvelle-Guin\xe9e': u'Papouasie-Nouvelle-Guin\xe9e',
750 +                        u'Paraguay': u'Paraguay',
751 +                        u'Paraiba': u'Br\xe9sil',
752 +                        u'Paran\xe1': u'Br\xe9sil',
753 +                        u'Paris': u'France',
754 +                        u'Par\xe1': u'Br\xe9sil',
755 +                        u'Pas-de-Calais': u'France',
756 +                        u'Pays Basque': u'Espagne',
757 +                        u'Pays-Bas': u'Pays-Bas',
758 +                        u'Pays-Bas avant 1830': u'Pays-Bas avant 1830',
759 +                        u'Pays-de-la-Loire': u'France',
760 +                        u'Pennsylvanie': u'\xc9tats-Unis',
761 +                        u'Pernambouc': u'Br\xe9sil',
762 +                        u'Philippines': u'Philippines',
763 +                        u'Phocide': u'Gr\xe8ce',
764 +                        u'Phtiotide': u'Gr\xe8ce',
765 +                        u'Piau\xed': u'Br\xe9sil',
766 +                        u'Picardie': u'France',
767 +                        u'Pitcairn': u'Pitcairn',
768 +                        u'Pi\xe9mont': u'Italie',
769 +                        u'Pi\xe9rie': u'Gr\xe8ce',
770 +                        u'Poitou-Charentes': u'France',
771 +                        u'Pologne': u'Pologne',
772 +                        u'Polyn\xe9sie fran\xe7aise': u'Polyn\xe9sie fran\xe7aise',
773 +                        u'Porto Rico': u'Porto Rico',
774 +                        u'Portugal': u'Portugal',
775 +                        u'Pouilles': u'Italie',
776 +                        u"Provence-Alpes-C\xf4te d'Azur": u'France',
777 +                        u'Pr\xe9veza': u'Gr\xe8ce',
778 +                        u'Puebla': u'Mexique',
779 +                        u'Puy-de-D\xf4me': u'France',
780 +                        u'Pyr\xe9n\xe9es-Atlantiques': u'France',
781 +                        u'Pyr\xe9n\xe9es-Orientales': u'France',
782 +                        u'P\xe9lla': u'Gr\xe8ce',
783 +                        u'P\xe9loponn\xe8se': u'Gr\xe8ce',
784 +                        u'P\xe9rou': u'P\xe9rou',
785 +                        u'Qatar': u'Qatar',
786 +                        u'Queensland': u'Australie',
787 +                        u'Quer\xe9taro': u'Mexique',
788 +                        u'Quintana Roo': u'Mexique',
789 +                        u'Qu\xe9bec': u'Canada',
790 +                        u'Rhode Island': u'\xc9tats-Unis',
791 +                        u'Rhodope': u'Gr\xe8ce',
792 +                        u'Rh\xe9nanie-Palatinat': u'Allemagne',
793 +                        u'Rh\xe9nanie-du-Nord-Westphalie': u'Allemagne',
794 +                        u'Rh\xf4ne': u'France',
795 +                        u'Rh\xf4ne-Alpes': u'France',
796 +                        u'Rio Grande do Norte': u'Br\xe9sil',
797 +                        u'Rio Grande do Sul': u'Br\xe9sil',
798 +                        u'Rio Negro': u'Argentine',
799 +                        u'Rio de Janeiro': u'Br\xe9sil',
800 +                        u'Rioja': u'Argentine',
801 +                        u'Rond\xf4nia': u'Br\xe9sil',
802 +                        u'Roraima': u'Br\xe9sil',
803 +                        u'Roumanie': u'Roumanie',
804 +                        u'Royaume-Uni': u'Grande-Bretagne',
805 +                        u'Russie (F\xe9d\xe9ration)': u'Russie (F\xe9d\xe9ration)',
806 +                        u'Rwanda': u'Rwanda',
807 +                        u'R\xe9publique Tch\xe8que': u'R\xe9publique tch\xe8que',
808 +                        u'R\xe9publique dominicaine': u'R\xe9publique dominicaine',
809 +                        u'R\xe9publique tch\xe8que': u'R\xe9publique tch\xe8que',
810 +                        u'R\xe9thymnon': u'Gr\xe8ce',
811 +                        u'R\xe9union': u'R\xe9union',
812 +                        u'Sahara occidental': u'Sahara occidental',
813 +                        u'Saint-Barth\xe9lemy': u'Saint-Barth\xe9lemy',
814 +                        u'Saint-Gall': u'Suisse',
815 +                        u'Saint-Kitts-et-Nevis': u'Saint-Kitts-et-Nevis',
816 +                        u'Saint-Marin': u'Saint-Marin',
817 +                        u'Saint-Martin (partie fran\xe7aise)': u'Saint-Martin (partie fran\xe7aise)',
818 +                        u'Saint-Martin (partie n\xe9erlandaise)': u'Saint-Martin (partie n\xe9erlandaise)',
819 +                        u'Saint-Pierre-et-Miquelon': u'Saint-Pierre-et-Miquelon',
820 +                        u'Saint-Vincent-et-les Grenadines': u'Saint-Vincent-et-les Grenadines',
821 +                        u'Sainte-H\xe9l\xe8ne': u'Sainte-H\xe9l\xe8ne',
822 +                        u'Sainte-Lucie': u'Sainte-Lucie',
823 +                        u'Salomon (\xeeles)': u'Salomon (\xeeles)',
824 +                        u'Salta': u'Argentine',
825 +                        u'Salzbourg': u'Autriche',
826 +                        u'Samoa': u'Samoa',
827 +                        u'Samoa am\xe9ricaines': u'Samoa am\xe9ricaines',
828 +                        u'San Juan': u'Argentine',
829 +                        u'San Luis': u'Argentine',
830 +                        u'San Luis Potos\xed': u'Mexique',
831 +                        u'Santa Catarina': u'Br\xe9sil',
832 +                        u'Santa Cruz': u'Argentine',
833 +                        u'Santa Fe': u'Argentine',
834 +                        u'Santiago del Estero': u'Argentine',
835 +                        u'Sao Tom\xe9-et-Principe': u'Sao Tom\xe9-et-Principe',
836 +                        u'Sardaigne': u'Italie',
837 +                        u'Sarre': u'Allemagne',
838 +                        u'Sarthe': u'France',
839 +                        u'Saskatchewan': u'Canada',
840 +                        u'Savoie': u'France',
841 +                        u'Saxe': u'Allemagne',
842 +                        u'Saxe-Anhalt': u'Allemagne',
843 +                        u'Sa\xf4ne-et-Loire': u'France',
844 +                        u'Schaffhouse': u'Suisse',
845 +                        u'Schleswig-Holstein': u'Allemagne',
846 +                        u'Schwyz': u'Suisse',
847 +                        u'Seine-Maritime': u'France',
848 +                        u'Seine-Saint-Denis': u'France',
849 +                        u'Seine-et-Marne': u'France',
850 +                        u'Serbie': u'Serbie',
851 +                        u'Serbie-et-Mont\xe9n\xe9gro': u'Serbie-et-Mont\xe9n\xe9gro',
852 +                        u'Sergipe': u'Br\xe9sil',
853 +                        u'Seychelles': u'Seychelles',
854 +                        u'Sicile': u'Italie',
855 +                        u'Sierra Leone': u'Sierra Leone',
856 +                        u'Sinaloa': u'Mexique',
857 +                        u'Singapour': u'Singapour',
858 +                        u'Slovaquie': u'Slovaquie',
859 +                        u'Slov\xe9nie': u'Slov\xe9nie',
860 +                        u'Soleure': u'Suisse',
861 +                        u'Somalie': u'Somalie',
862 +                        u'Somme': u'France',
863 +                        u'Sonora': u'Mexique',
864 +                        u'Soudan': u'Soudan',
865 +                        u'Soudan du Sud': u'Soudan du Sud',
866 +                        u'Sri Lanka': u'Sri Lanka',
867 +                        u'Styrie': u'Autriche',
868 +                        u'Suisse': u'Suisse',
869 +                        u'Suriname': u'Suriname',
870 +                        u'Su\xe8de': u'Su\xe8de',
871 +                        u'Svalbard et \xeele Jan Mayen': u'Svalbard et \xeele Jan Mayen',
872 +                        u'Swaziland': u'Swaziland',
873 +                        u'Syrie': u'Syrie',
874 +                        u'S\xe1mos': u'Gr\xe8ce',
875 +                        u'S\xe3o Paulo': u'Br\xe9sil',
876 +                        u'S\xe9n\xe9gal': u'S\xe9n\xe9gal',
877 +                        u'S\xe9rrai': u'Gr\xe8ce',
878 +                        u'Tabasco': u'Mexique',
879 +                        u'Tadjikistan': u'Tadjikistan',
880 +                        u'Tamaulipas': u'Mexique',
881 +                        u'Tanzanie': u'Tanzanie',
882 +                        u'Tarn': u'France',
883 +                        u'Tarn-et-Garonne': u'France',
884 +                        u'Tasmanie': u'Australie',
885 +                        u'Ta\xefwan': u'Ta\xefwan',
886 +                        u'Tchad': u'Tchad',
887 +                        u'Tch\xe9coslovaquie': u'Tch\xe9coslovaquie',
888 +                        u'Tennessee': u'\xc9tats-Unis',
889 +                        u'Terre de Feu': u'Argentine',
890 +                        u'Terre de la Reine-Maud (Norv\xe8ge)': u'Antarctique',
891 +                        u'Terre-Neuve': u'Canada',
892 +                        u'Terres australes et antarctiques fran\xe7aises': u'Antarctique',
893 +                        u'Terres australes fran\xe7aises': u'Terres australes fran\xe7aises',
894 +                        u'Territoire antarctique australien': u'Antarctique',
895 +                        u'Territoire antarctique britannique': u'Antarctique',
896 +                        u"Territoire britannique de l'Oc\xe9an indien": u"Territoire britannique de l'Oc\xe9an indien",
897 +                        u'Territoire de la capitale australienne': u'Australie',
898 +                        u'Territoire du Nord': u'Australie',
899 +                        u'Territoire du Yukon': u'Canada',
900 +                        u'Territoire-de-Belfort': u'France',
901 +                        u'Territoires du Nord-Ouest': u'Canada',
902 +                        u'Tessin': u'Suisse',
903 +                        u'Texas': u'\xc9tats-Unis',
904 +                        u'Tha\xeflande': u'Tha\xeflande',
905 +                        u'Thesprotie': u'Gr\xe8ce',
906 +                        u'Thessalie': u'Gr\xe8ce',
907 +                        u'Thessalonique': u'Gr\xe8ce',
908 +                        u'Thurgovie': u'Suisse',
909 +                        u'Thuringe': u'Allemagne',
910 +                        u'Timor oriental': u'Timor oriental',
911 +                        u'Tlaxcala': u'Mexique',
912 +                        u'Togo': u'Togo',
913 +                        u'Tokelau': u'Tokelau',
914 +                        u'Tonga': u'Tonga',
915 +                        u'Toscane': u'Italie',
916 +                        u'Trentin-Haut-Adige': u'Italie',
917 +                        u'Trinit\xe9-et-Tobago': u'Trinit\xe9-et-Tobago',
918 +                        u'Tr\xedkala': u'Gr\xe8ce',
919 +                        u'Tucum\xe1n': u'Argentine',
920 +                        u'Tunisie': u'Tunisie',
921 +                        u'Turkm\xe9nistan': u'Turkm\xe9nistan',
922 +                        u'Turks et Ca\xefques (\xeeles)': u'Turks et Ca\xefques (\xeeles)',
923 +                        u'Turquie': u'Turquie',
924 +                        u'Tuvalu': u'Tuvalu',
925 +                        u'Tyrol': u'Autriche',
926 +                        u'URSS': u'URSS',
927 +                        u'US': u'\xc9tats-Unis',
928 +                        u'USA': u'\xc9tats-Unis',
929 +                        u'Ukraine': u'Ukraine',
930 +                        u'Uri': u'Suisse',
931 +                        u'Uruguay': u'Uruguay',
932 +                        u'Utah': u'\xc9tats-Unis',
933 +                        u'Utrecht': u'Pays-Bas',
934 +                        u"Val d'Aoste": u'Italie',
935 +                        u"Val-d'Oise": u'France',
936 +                        u'Val-de-Marne': u'France',
937 +                        u'Valais': u'Suisse',
938 +                        u'Vanuatu': u'Vanuatu',
939 +                        u'Var': u'France',
940 +                        u'Vatican': u'Vatican',
941 +                        u'Vaucluse': u'France',
942 +                        u'Vaud': u'Suisse',
943 +                        u'Vend\xe9e': u'France',
944 +                        u'Venezuela': u'Venezuela',
945 +                        u'Veracruz': u'Mexique',
946 +                        u'Vermont': u'\xc9tats-Unis',
947 +                        u'Victoria': u'Australie',
948 +                        u'Vienne': u'Autriche',
949 +                        u'Viet Nam': u'Viet Nam',
950 +                        u'Virginie': u'\xc9tats-Unis',
951 +                        u'Virginie occidentale': u'\xc9tats-Unis',
952 +                        u'Vorarlberg': u'Autriche',
953 +                        u'Vosges': u'France',
954 +                        u'V\xe9n\xe9tie': u'Italie',
955 +                        u'Wallis et Futuna (\xeeles)': u'Wallis et Futuna (\xeeles)',
956 +                        u'Washington': u'\xc9tats-Unis',
957 +                        u'Wisconsin': u'\xc9tats-Unis',
958 +                        u'Wyoming': u'\xc9tats-Unis',
959 +                        u'X\xe1nthi': u'Gr\xe8ce',
960 +                        u'Yonne': u'France',
961 +                        u'Yougoslavie': u'Yougoslavie',
962 +                        u'Yucat\xe1n': u'Mexique',
963 +                        u'Yvelines': u'France',
964 +                        u'Y\xe9men': u'Y\xe9men',
965 +                        u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)': u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)',
966 +                        u'Zacatecas': u'Mexique',
967 +                        u'Zambie': u'Zambie',
968 +                        u'Zimbabwe': u'Zimbabwe',
969 +                        u'Zoug': u'Suisse',
970 +                        u'Zurich': u'Suisse',
971 +                        u'Z\xe1kynthos': u'Gr\xe8ce',
972 +                        u'Z\xe9lande': u'Pays-Bas',
973 +                        u'aire g\xe9ographique ancienne': u'aire g\xe9ographique ancienne',
974 +                        u'code non adapt\xe9': u'code non adapt\xe9',
975 +                        u'inconnu': u'inconnu',
976 +                        u'intergouvernemental': u'intergouvernemental',
977 +                        u'multiple': u'multiple',
978 +                        u"ne s'applique pas": u"ne s'applique pas",
979 +                        u'non renseign\xe9': u'non renseign\xe9',
980 +                        u'\xc1rta': u'Gr\xe8ce',
981 +                        u'\xc9gypte': u'\xc9gypte',
982 +                        u'\xc9lide': u'Gr\xe8ce',
983 +                        u'\xc9mathie': u'Gr\xe8ce',
984 +                        u'\xc9milie-Romagne': u'Italie',
985 +                        u'\xc9mirats arabes unis': u'\xc9mirats arabes unis',
986 +                        u'\xc9pire': u'Gr\xe8ce',
987 +                        u'\xc9quateur': u'\xc9quateur',
988 +                        u'\xc9rythr\xe9e': u'\xc9rythr\xe9e',
989 +                        u'\xc9tats-Unis': u'\xc9tats-Unis',
990 +                        u'\xc9thiopie': u'\xc9thiopie',
991 +                        u'\xc9tolie-et-Acarnanie': u'Gr\xe8ce',
992 +                        u'\xc9vros': u'Gr\xe8ce',
993 +                        u'\xcele Pierre 1er (Norv\xe8ge)': u'Antarctique',
994 +                        u'\xcele de Man': u'\xcele de Man',
995 +                        u'\xcele du Prince-\xc9douard': u'Canada',
996 +                        u'\xcele-de-France': u'France',
997 +                        u'\xceles Bal\xe9ares': u'Espagne',
998 +                        u'\xceles Ioniennes': u'Gr\xe8ce',
999 +                        u'\xceles Vierges am\xe9ricaines': u'\xceles Vierges am\xe9ricaines',
1000 +                        u'\xceles Vierges britanniques': u'\xceles Vierges britanniques',
1001 +                        u'\xceles de la Mer \xc9g\xe9e m\xe9ridionale': u'Gr\xe8ce',
1002 +                        u'\xceles de la Mer \xc9g\xe9e septentrionale': u'Gr\xe8ce',
1003 +                        u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis': u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis'
1004 +                                                }
diff --git a/data/countries_iso_3166.txt b/data/countries_iso_3166.txt
@@ -0,0 +1,269 @@
1005 +##,non renseigné
1006 +..,non renseigné
1007 +aa,aire géographique ancienne
1008 +ad,Andorre
1009 +ae,Émirats arabes unis
1010 +af,Afghanistan
1011 +ag,Antigua-et-Barbuda
1012 +ai,Anguilla
1013 +al,Albanie
1014 +am,Arménie
1015 +an,Antilles néerlandaises
1016 +ao,Angola
1017 +aq,Antarctique
1018 +ar,Argentine
1019 +as,Samoa américaines
1020 +at,Autriche
1021 +au,Australie
1022 +aw,Aruba
1023 +ax,Aland (îles)
1024 +az,Azerbaïdjan
1025 +ba,Bosnie-Herzégovine
1026 +bb,Barbade
1027 +bd,Bangladesh
1028 +be,Belgique
1029 +bf,Burkina
1030 +bg,Bulgarie
1031 +bh,Bahreïn
1032 +bi,Burundi
1033 +bj,Bénin
1034 +bl,Saint-Barthélemy
1035 +bm,Bermudes
1036 +bn,Brunéi
1037 +bo,Bolivie
1038 +bq,Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache
1039 +br,Brésil
1040 +bs,Bahamas
1041 +bt,Bhoutan
1042 +bv,Bouvet (île)
1043 +bw,Botswana
1044 +by,Biélorussie,Bélarus
1045 +bz,Belize
1046 +ca,Canada
1047 +cc,Cocos (îles),Keeling (îles)
1048 +cd,Congo (République démocratique),Zaïre
1049 +cf,Centrafrique,République centrafricaine
1050 +cg,Congo,Congo (République)
1051 +ch,Suisse,Confédération helvétique
1052 +ci,Côte d'Ivoire
1053 +ck,Cook (îles)
1054 +cl,Chili
1055 +cm,Cameroun
1056 +cn,Chine,Chine (République populaire)
1057 +co,Colombie
1058 +cr,Costa Rica
1059 +cs,Serbie-et-Monténégro
1060 +cu,Cuba
1061 +cv,Cap-Vert
1062 +cw,Curaçao
1063 +cx,Christmas (île)
1064 +cy,Chypre
1065 +cz,République tchèque,Tchèque, République
1066 +dd,Allemagne (République démocratique)
1067 +de,Allemagne,Allemagne (République fédérale)
1068 +dj,Djibouti
1069 +dk,Danemark
1070 +dm,Dominique
1071 +do,République dominicaine,Dominicaine, République
1072 +dz,Algérie
1073 +ec,Équateur
1074 +ee,Estonie
1075 +eg,Égypte
1076 +eh,Sahara occidental
1077 +er,Érythrée
1078 +es,Espagne
1079 +et,Éthiopie
1080 +fi,Finlande
1081 +fj,Fidji
1082 +fk,Malouines (îles),Falkland (îles)
1083 +fm,Micronésie,États fédérés de Micronésie
1084 +fo,Féroé (îles)
1085 +fr,France
1086 +ga,Gabon
1087 +gb,Grande-Bretagne,Royaume-Uni
1088 +gd,Grenade
1089 +ge,Géorgie
1090 +gf,Guyane française
1091 +gg,Guernesey
1092 +gh,Ghana
1093 +gi,Gibraltar
1094 +gl,Groenland
1095 +gm,Gambie
1096 +gn,Guinée
1097 +gp,Guadeloupe
1098 +gq,Guinée équatoriale
1099 +gr,Grèce
1100 +gs,Géorgie du Sud et les îles Sandwich du Sud
1101 +gt,Guatemala
1102 +gu,Guam
1103 +gw,Guinée-Bissau
1104 +gy,Guyana
1105 +hk,Hong Kong
1106 +hm,Heard (île) et îles McDonald
1107 +hn,Honduras
1108 +hr,Croatie
1109 +ht,Haïti
1110 +hu,Hongrie
1111 +id,Indonésie
1112 +ie,Irlande
1113 +ii,intergouvernemental
1114 +il,Israël
1115 +im,Île de Man,Man, Île de
1116 +in,Inde
1117 +io,Territoire britannique de l'Océan indien,Chagos (îles)###Océan indien, Territoire britannique de l'
1118 +iq,Irak
1119 +ir,Iran
1120 +is,Islande
1121 +it,Italie
1122 +je,Jersey
1123 +jm,Jamaïque
1124 +jo,Jordanie
1125 +jp,Japon
1126 +ke,Kenya
1127 +kg,Kirghizistan
1128 +kh,Cambodge
1129 +ki,Kiribati
1130 +km,Comores
1131 +kn,Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis
1132 +ko,Kosovo
1133 +kp,Corée (République populaire démocratique),Corée du Nord
1134 +kr,Corée (République),Corée du Sud
1135 +kw,Koweït
1136 +ky,Cayman,Caïmanes, Îles###Caïman (îles)
1137 +kz,Kazakhstan
1138 +la,Laos
1139 +lb,Liban
1140 +lc,Sainte-Lucie
1141 +li,Liechtenstein
1142 +lk,Sri Lanka
1143 +lr,Liberia
1144 +ls,Lesotho
1145 +lt,Lituanie
1146 +lu,Luxembourg
1147 +lv,Lettonie
1148 +ly,Libye
1149 +ma,Maroc
1150 +mc,Monaco
1151 +md,Moldavie,Moldova, République de
1152 +me,Monténégro
1153 +mf,Saint-Martin (partie française)
1154 +mg,Madagascar
1155 +mh,Marshall (îles)
1156 +mk,Macédoine (République)
1157 +ml,Mali
1158 +mm,Myanmar,Birmanie
1159 +mn,Mongolie
1160 +mo,Macao
1161 +mp,Mariannes du Nord (îles)
1162 +mq,Martinique
1163 +mr,Mauritanie
1164 +ms,Montserrat
1165 +mt,Malte
1166 +mu,Maurice
1167 +mv,Maldives
1168 +mw,Malawi
1169 +mx,Mexique
1170 +my,Malaisie
1171 +mz,Mozambique
1172 +na,Namibie
1173 +nc,Nouvelle-Calédonie
1174 +ne,Niger
1175 +nf,Norfolk (île)
1176 +ng,Nigeria
1177 +ni,Nicaragua
1178 +nl,Pays-Bas
1179 +no,Norvège
1180 +np,Népal
1181 +nr,Nauru
1182 +nu,Niue
1183 +nz,Nouvelle-Zélande
1184 +om,Oman
1185 +oo,code non adapté
1186 +pa,Panama
1187 +pe,Pérou
1188 +pf,Polynésie française
1189 +pg,Papouasie-Nouvelle-Guinée
1190 +ph,Philippines
1191 +pk,Pakistan
1192 +pl,Pologne
1193 +pm,Saint-Pierre-et-Miquelon
1194 +pn,Pitcairn
1195 +pr,Porto Rico
1196 +ps,Autorité palestinienne,Palestine
1197 +pt,Portugal
1198 +pw,Palau,Palaos
1199 +py,Paraguay
1200 +qa,Qatar
1201 +re,Réunion
1202 +ro,Roumanie
1203 +rs,Serbie
1204 +ru,Russie (Fédération),Russie
1205 +rw,Rwanda
1206 +sa,Arabie saoudite
1207 +sb,Salomon (îles)
1208 +sc,Seychelles
1209 +sd,Soudan
1210 +se,Suède
1211 +sg,Singapour
1212 +sh,Sainte-Hélène,Ascension (île)###Tristan da Cunha (île)
1213 +si,Slovénie
1214 +sj,Svalbard et île Jan Mayen
1215 +sk,Slovaquie
1216 +sl,Sierra Leone
1217 +sm,Saint-Marin
1218 +sn,Sénégal
1219 +so,Somalie
1220 +sr,Suriname
1221 +ss,Soudan du Sud,Sud Soudan
1222 +st,Sao Tomé-et-Principe
1223 +su,URSS
1224 +sv,El Salvador,Salvador
1225 +sx,Saint-Martin (partie néerlandaise),Sint Maarten
1226 +sy,Syrie
1227 +sz,Swaziland
1228 +tc,Turks et Caïques (îles)
1229 +td,Tchad
1230 +tf,Terres australes françaises
1231 +tg,Togo
1232 +th,Thaïlande
1233 +tj,Tadjikistan
1234 +tk,Tokelau
1235 +tl,Timor oriental
1236 +tm,Turkménistan
1237 +tn,Tunisie
1238 +to,Tonga
1239 +tr,Turquie
1240 +tt,Trinité-et-Tobago
1241 +tv,Tuvalu
1242 +tw,Taïwan,Chine (République)
1243 +tz,Tanzanie
1244 +ua,Ukraine
1245 +ug,Ouganda
1246 +um,Îles mineures éloignées des États-Unis
1247 +us,États-Unis
1248 +uy,Uruguay
1249 +uz,Ouzbékistan
1250 +va,Vatican,Saint-Siège
1251 +vc,Saint-Vincent-et-les Grenadines
1252 +ve,Venezuela
1253 +vg,Îles Vierges britanniques,Vierges (îles) britanniques
1254 +vi,Îles Vierges américaines,Vierges (îles) américaines
1255 +vn,Viet Nam
1256 +vu,Vanuatu
1257 +wf,Wallis et Futuna (îles)
1258 +ws,Samoa,Samoa occidentales
1259 +xc,Tchécoslovaquie
1260 +xd,Allemagne avant 1945
1261 +xe,Europe,Union européenne
1262 +xk,Corée avant 1948
1263 +xn,Pays-Bas avant 1830,Belgique avant 1830
1264 +xx,inconnu
1265 +yd,Yémen (République démocratique populaire),Sud Yémen
1266 +ye,Yémen
1267 +yt,Mayotte
1268 +yu,Yougoslavie
1269 +yy,ne s'applique pas
1270 +za,Afrique du Sud
1271 +zm,Zambie
1272 +zw,Zimbabwe
1273 +zz,multiple
diff --git a/data/stopwords.py b/data/stopwords.py
@@ -0,0 +1,15 @@
1274 +# -*- coding: utf-8 -*-
1275 +"""
1276 +Stopwords in different languages.
1277 +"""
1278 +
1279 +FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
1280 +
1281 +
1282 +ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 
'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 
'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
1283 +
1284 +
1285 +ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 
'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 
'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
1286 +
1287 +
1288 +ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 
'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 
'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
diff --git a/data/us_states.py b/data/us_states.py
@@ -0,0 +1,211 @@
1289 +# -*- coding: utf-8 -*-
1290 +
1291 +# See http://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations
1292 +# WARNING: The name of each state should be in French
1293 +# (e.g. "Floride", not "Florida")
1294 +US_STATES = {'AK': 'Alaska',
1295 +             'AL': 'Alabama',
1296 +             'AR': 'Arkansas',
1297 +             'AZ': 'Arizona',
1298 +             'Ala.': 'Alabama',
1299 +             'Alas.': 'Alaska',
1300 +             'Alaska': 'Alaska',
1301 +             'Ariz.': 'Arizona',
1302 +             'Ark.': 'Arkansas',
1303 +             'Az.': 'Arizona',
1304 +             'CA': 'Californie',
1305 +             'CF': 'Californie',
1306 +             'CL': 'Colorado',
1307 +             'CO': 'Colorado',
1308 +             'CT': 'Connecticut',
1309 +             'Ca.': 'Californie',
1310 +             'Cal.': 'Californie',
1311 +             'Cali.': 'Californie',
1312 +             'Calif.': 'Californie',
1313 +             'Col.': 'Colorado',
1314 +             'Colo.': 'Colorado',
1315 +             'Conn.': 'Connecticut',
1316 +             'Ct.': 'Connecticut',
1317 +             'D.C.': 'District de Columbia',
1318 +             'DC': 'District de Columbia',
1319 +             'DE': 'Delaware',
1320 +             'DL': 'Delaware',
1321 +             'De.': 'Delaware',
1322 +             'Del.': 'Delaware',
1323 +             'FL': 'Floride',
1324 +             'Fl.': 'Floride',
1325 +             'Fla.': 'Floride',
1326 +             'Flor.': 'Floride',
1327 +             'GA': u'Géorgie',
1328 +             'Ga.': u'Géorgie',
1329 +             'H.I.': 'Hawaii',
1330 +             'HA': 'Hawaii',
1331 +             'HI': 'Hawaii',
1332 +             'Hawaii': 'Hawaii',
1333 +             'IA': 'Iowa',
1334 +             'ID': 'Idaho',
1335 +             'IL': 'Illinois',
1336 +             'IN': 'Indiana',
1337 +             'Ia.': 'Iowa',
1338 +             'Id.': 'Idaho',
1339 +             'Ida.': 'Idaho',
1340 +             'Idaho': 'Idaho',
1341 +             'Il.': 'Illinois',
1342 +             "Ill's": 'Illinois',
1343 +             'Ill.': 'Illinois',
1344 +             'Ills.': 'Illinois',
1345 +             'In.': 'Indiana',
1346 +             'Ind.': 'Indiana',
1347 +             'Ioa.': 'Iowa',
1348 +             'Iowa': 'Iowa',
1349 +             'KA': 'Kansas',
1350 +             'KS': 'Kansas',
1351 +             'KY': 'Kentucky',
1352 +             'Ka.': 'Kansas',
1353 +             'Kan.': 'Kansas',
1354 +             'Kans.': 'Kansas',
1355 +             'Ks.': 'Kansas',
1356 +             'Ky.': 'Kentucky',
1357 +             'LA': 'Louisiane',
1358 +             'La.': 'Louisiane',
1359 +             'MA': 'Massachusetts',
1360 +             'MC': 'Michigan',
1361 +             'MD': 'Maryland',
1362 +             'ME': 'Maine',
1363 +             'MI': 'Mississippi',
1364 +             'MN': 'Minnesota',
1365 +             'MO': 'Missouri',
1366 +             'MS': 'Mississippi',
1367 +             'MT': 'Montana',
1368 +             'Maine': 'Maine',
1369 +             'Mass.': 'Massachusetts',
1370 +             'Md.': 'Maryland',
1371 +             'Me.': 'Maine',
1372 +             'Mich.': 'Michigan',
1373 +             'Minn.': 'Minnesota',
1374 +             'Miss.': 'Mississippi',
1375 +             'Mn.': 'Minnesota',
1376 +             'Mo.': 'Missouri',
1377 +             'Mont.': 'Montana',
1378 +             'N. Car.': 'Caroline du Nord',
1379 +             'N. Dak.': 'Dakota du Nord',
1380 +             'N. Mex.': 'Nouveau-Mexique',
1381 +             'N. York': 'New York',
1382 +             'N.C.': 'Caroline du Nord',
1383 +             'N.D.': 'Dakota du Nord',
1384 +             'N.H.': 'New Hampshire',
1385 +             'N.J.': 'New Jersey',
1386 +             'N.M.': 'Nouveau-Mexique',
1387 +             'N.Y.': 'New York',
1388 +             'NB': 'Nebraska',
1389 +             'NC': 'Caroline du Nord',
1390 +             'ND': 'Dakota du Nord',
1391 +             'NE': 'Nebraska',
1392 +             'NH': 'New Hampshire',
1393 +             'NJ': 'New Jersey',
1394 +             'NM': 'Nouveau-Mexique',
1395 +             'NV': 'Nevada',
1396 +             'NY': 'New York',
1397 +             'Neb.': 'Nebraska',
1398 +             'Nebr.': 'Nebraska',
1399 +             'Nev.': 'Nevada',
1400 +             'New M.': 'Nouveau-Mexique',
1401 +             'NoDak': 'Dakota du Nord',
1402 +             'Nv.': 'Nevada',
1403 +             'O.': 'Ohio',
1404 +             'OH': 'Ohio',
1405 +             'OK': 'Oklahoma',
1406 +             'OR': 'Oregon',
1407 +             'Oh.': 'Ohio',
1408 +             'Ohio': 'Ohio',
1409 +             'Ok.': 'Oklahoma',
1410 +             'Okla.': 'Oklahoma',
1411 +             'Or.': 'Oregon',
1412 +             'Ore.': 'Oregon',
1413 +             'Oreg.': 'Oregon',
1414 +             'PA': 'Pennsylvanie',
1415 +             'Pa.': 'Pennsylvanie',
1416 +             'R.I.': 'Rhode Island',
1417 +             'R.I. & P.P.': 'Rhode Island',
1418 +             'RI': 'Rhode Island',
1419 +             'S. Car.': 'Caroline du Sud',
1420 +             'S. Dak.': 'Dakota du Sud',
1421 +             'S.C.': 'Caroline du Sud',
1422 +             'S.D.': 'Dakota du Sud',
1423 +             'SC': 'Caroline du Sud',
1424 +             'SD': 'Dakota du Sud',
1425 +             'SoDak': 'Dakota du Sud',
1426 +             'State': 'Utah',
1427 +             'TN': 'Tennessee',
1428 +             'TX': 'Texas',
1429 +             'Tenn.': 'Tennessee',
1430 +             'Tex.': 'Texas',
1431 +             'Texas': 'Texas',
1432 +             'Tn.': 'Tennessee',
1433 +             'Tx.': 'Texas',
1434 +             'US-AL': 'Alabama',
1435 +             'US-AR': 'Arkansas',
1436 +             'US-AZ': 'Arizona',
1437 +             'US-CA': 'Californie',
1438 +             'US-CO': 'Colorado',
1439 +             'US-CT': 'Connecticut',
1440 +             'US-DC': 'District de Columbia',
1441 +             'US-DE': 'Delaware',
1442 +             'US-FL': 'Floride',
1443 +             'US-GA': u'Géorgie',
1444 +             'US-IL': 'Illinois',
1445 +             'US-IN': 'Indiana',
1446 +             'US-KY': 'Kentucky',
1447 +             'US-LA': 'Louisiane',
1448 +             'US-MA': 'Massachusetts',
1449 +             'US-MD': 'Maryland',
1450 +             'US-MI': 'Michigan',
1451 +             'US-MN': 'Minnesota',
1452 +             'US-MO': 'Missouri',
1453 +             'US-MS': 'Mississippi',
1454 +             'US-MT': 'Montana',
1455 +             'US-NC': 'Caroline du Nord',
1456 +             'US-ND': 'Dakota du Nord',
1457 +             'US-NE': 'Nebraska',
1458 +             'US-NH': 'New Hampshire',
1459 +             'US-NJ': 'New Jersey',
1460 +             'US-NM': 'Nouveau-Mexique',
1461 +             'US-NY': 'New York',
1462 +             'US-OK': 'Oklahoma',
1463 +             'US-PA': 'Pennsylvanie',
1464 +             'US-RI': 'Rhode Island',
1465 +             'US-SC': 'Caroline du Sud',
1466 +             'US-SD': 'Dakota du Sud',
1467 +             'US-TN': 'Tennessee',
1468 +             'US-VA': 'Virginie',
1469 +             'US-VT': 'Vermont',
1470 +             'US-WA': 'Washington',
1471 +             'US-WI': 'Wisconsin',
1472 +             'US-WV': 'Virginie occidentale',
1473 +             'US-WY': 'Wyoming',
1474 +             'UT': 'Utah',
1475 +             'Ut.': 'Utah',
1476 +             'Utah': 'Utah',
1477 +             'VA': 'Virginie',
1478 +             'VT': 'Vermont',
1479 +             'Va.': 'Virginie',
1480 +             'Vt.': 'Vermont',
1481 +             'W. Va.': 'Virginie occidentale',
1482 +             'W. Virg.': 'Virginie occidentale',
1483 +             'W.V.': 'Virginie occidentale',
1484 +             'W.Va.': 'Virginie occidentale',
1485 +             'WA': 'Washington',
1486 +             'WI': 'Wisconsin',
1487 +             'WN': 'Washington',
1488 +             'WS': 'Wisconsin',
1489 +             'WV': 'Virginie occidentale',
1490 +             'WY': 'Wyoming',
1491 +             'Wa.': 'Washington',
1492 +             'Wash.': 'Washington',
1493 +             'Wash. D.C.': 'District de Columbia',
1494 +             'Wi.': 'Wisconsin',
1495 +             'Wis.': 'Wisconsin',
1496 +             'Wisc.': 'Wisconsin',
1497 +             'Wn.': 'Washington',
1498 +             'Wy.': 'Wyoming',
1499 +             'Wyo.': 'Wyoming'}
diff --git a/demo.py b/demo.py
@@ -1,197 +0,0 @@
1500 -#!/usr/bin/python
1501 -#-*- coding:utf-8 -*-
1502 -
1503 -from os import path
1504 -
1505 -import urllib
1506 -
1507 -import nazca.distances as ald
1508 -import nazca.normalize as aln
1509 -from nazca.aligner import align, subalign, findneighbours, alignall
1510 -from nazca.dataio import parsefile, sparqlquery, write_results
1511 -
1512 -DEMODIR = path.dirname(__file__)
1513 -
1514 -def dpath(filename):
1515 -    return path.join(DEMODIR, 'demo', filename)
1516 -
1517 -def remove_after(string, sub):
1518 -    try:
1519 -        return string[:string.lower().index(sub)].strip()
1520 -    except ValueError:
1521 -        return string
1522 -
1523 -def parserql(host, rql):
1524 -    filehandle = urllib.urlopen('%(host)sview?'
1525 -                                'rql=%(rql)s&vid=csvexport'
1526 -                                % {'rql': rql, 'host': host})
1527 -    filehandle.readline()
1528 -    rset = [[e.decode('utf-8') for e in line.strip().split(';')]
1529 -            for line in filehandle]
1530 -    return rset
1531 -
1532 -def demo_0():
1533 -    # prixgoncourt is the list of Goncourt Prize, extracted
1534 -    # from wikipedia
1535 -
1536 -    #We try to align Goncourt winers onto dbpedia results
1537 -
1538 -    query = """
1539 -       SELECT ?writer, ?name WHERE {
1540 -          ?writer  <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:French_novelists>.
1541 -          ?writer rdfs:label ?name.
1542 -          FILTER(lang(?name) = 'fr')
1543 -       }
1544 -    """
1545 -
1546 -    print "Sending query to dbpedia"
1547 -    targetset = sparqlquery('http://dbpedia.org/sparql', query)
1548 -    print "Reading the prixgoncourt file"
1549 -    alignset = parsefile(dpath('prixgoncourt'), indexes=[1, 1])
1550 -
1551 -    tr_name = {'normalization': [lambda x:remove_after(x, '('),
1552 -                                 aln.simplify],
1553 -               'metric': ald.levenshtein
1554 -              }
1555 -
1556 -    processings = {1: tr_name}
1557 -
1558 -    print "Alignment started"
1559 -    align(alignset, targetset, 0.4, processings,
1560 -          dpath('demo0_results'))
1561 -
1562 -    print "Done, see the resuls in %s" % dpath('demo0_results')
1563 -
1564 -def demo_1():
1565 -    # FR.txt is an extract of geonames, where locations have been sorted by name
1566 -    # frenchbnf is an extract of french BNF's locations, sorted by name too
1567 -
1568 -    # For each line (ie location) we keep the identifier, the name and the
1569 -    # position (longitude, latitude)
1570 -    # ``nbmax`` is the number of locations to load
1571 -
1572 -    print "Parsing the input files"
1573 -    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
1574 -                          nbmax=2000)
1575 -    alignset = parsefile(dpath('frenchbnf'),
1576 -                         indexes=[0, 2, (14, 12)], nbmax=1000)
1577 -
1578 -
1579 -    # Let's define the processings to apply on the location's name
1580 -    tr_name = {'normalization': [aln.simplify], # Simply all the names (remove
1581 -                                              #   punctuation, lower case, etc)
1582 -               'metric': ald.levenshtein,       # Use the levenshtein distance
1583 -               'weighting': 1                 # Use 1 a name-distance matrix
1584 -                                              #   weighting coefficient
1585 -              }
1586 -    tr_geo = {'normalization': [],              # No normalization needed
1587 -              'metric': ald.geographical,         # Use the geographical distance
1588 -              'metric_params': {'units': 'km'},# Arguments given the
1589 -                                                #   distance function. Here,
1590 -                                                #   the unit to use
1591 -              'weighting': 1
1592 -             }
1593 -
1594 -    processings = {1: tr_name, 2: tr_geo}
1595 -
1596 -    print "Alignment started"
1597 -    align(alignset,           # The dataset to align
1598 -          targetset,          # The target dataset
1599 -          0.4,                # The maximal distance
1600 -                              #   threshold
1601 -          processings,         # The list of processings to
1602 -                              #   apply.
1603 -          dpath('demo1_results'))
1604 -                              # Filename of the output
1605 -                              #   result file
1606 -    # the ``align()`` function return two items
1607 -    # 0. the computed distance matrix
1608 -    # 1. a boolean, True if at least one alignment has been done, False
1609 -    #    otherwise
1610 -    print "Done, see the results in %s" % dpath('demo1_results')
1611 -
1612 -def demo_2():
1613 -    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
1614 -                          formatopt={1:lambda x:x.decode('utf-8')})
1615 -    alignset = parsefile(dpath('frenchbnf'), indexes=[0, 2, (14, 12)],
1616 -                         formatopt={2:lambda x:x.decode('utf-8')}, nbmax=30000)
1617 -
1618 -    print "Finding neighbours"
1619 -    neighbours = findneighbours(alignset, targetset, indexes=(2, 2),
1620 -                               mode='minibatch')
1621 -
1622 -    # Let's define the processings to apply on the location's name
1623 -    tr_name = {'normalization': [aln.simplify], # Simply all the names (remove
1624 -                                              #   punctuation, lower case, etc)
1625 -               'metric': ald.levenshtein,     # Use the levenshtein distance
1626 -               'weighting': 1                 # Use 1 a name-distance matrix
1627 -                                              #   weighting coefficient
1628 -              }
1629 -
1630 -    processings = {1: tr_name}
1631 -    print "Start computation"
1632 -    for ind, (alignid, targetid) in enumerate(neighbours):
1633 -        print '%3d' % ind, len(alignid), 'x', len(targetid)
1634 -        _, matched = subalign(alignset,   # The dataset to align
1635 -                              targetset,  # The target dataset
1636 -                              alignid,
1637 -                              targetid,
1638 -                              0.3,
1639 -                              processings)
1640 -        write_results(matched, alignset, targetset, dpath('demo2_results'))
1641 -    print "Done, see the results in %s" % dpath('demo2_results')
1642 -
1643 -def demo_3():
1644 -    print "Parsing files"
1645 -    alignset = parserql(host='http://demo.cubicweb.org/elections/',
1646 -                        rql='Any E, N WHERE X is Commune, X eid E, X label N')
1647 -    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1])
1648 -    print '%s×%s' % (len(alignset), len(targetset))
1649 -
1650 -    tr_name = {'normalization': [aln.simplify],
1651 -               'metric': 'levenshtein'
1652 -              }
1653 -
1654 -    print "Alignment started"
1655 -    results = alignall(alignset, targetset, 0.75, processings={1: tr_name},
1656 -                       indexes=(1,1), mode='minhashing', kwordsgram=1, siglen=200,
1657 -                       uniq=True)
1658 -    dicresults = dict([(a, b) for (a, b) in results])
1659 -
1660 -    print "Done, writing output"
1661 -
1662 -    with open(dpath('demo3_results'), 'w') as fout:
1663 -        for line in alignset:
1664 -            sent = u'http://demo.cubicweb.org/elections/commune/%s;'\
1665 -                   u'http://www.geonames.org/%s\n' \
1666 -                   % (line[0], dicresults.get(line[0], 'not_found'))
1667 -            fout.write(sent.encode('utf-8'))
1668 -
1669 -    print "See the results in %s" % dpath('demo3_results')
1670 -
1671 -if __name__ == '__main__':
1672 -    import sys
1673 -    from time import time
1674 -    runall = (len(sys.argv) == 1)
1675 -
1676 -    t = time()
1677 -    if runall or '0' in sys.argv:
1678 -        print "Running demo_0"
1679 -        demo_0()
1680 -
1681 -    if runall or '1' in sys.argv:
1682 -        print "Running demo_1"
1683 -        demo_1()
1684 -
1685 -    if runall or '2' in sys.argv:
1686 -        print "Running demo_2"
1687 -        ## Same as demo_1, but in a more efficient way, using a method to find
1688 -        ## neighbours
1689 -        demo_2()
1690 -
1691 -    if runall or '3' in sys.argv:
1692 -        print "Running demo_3"
1693 -        demo_3()
1694 -
1695 -    print "Demo terminated"
1696 -    print "Took %d min" % ((time() - t)/60.)
diff --git a/examples/demo.py b/examples/demo.py
@@ -0,0 +1,197 @@
1697 +#!/usr/bin/python
1698 +#-*- coding:utf-8 -*-
1699 +
1700 +from os import path
1701 +
1702 +import urllib
1703 +
1704 +import nazca.distances as ald
1705 +import nazca.normalize as aln
1706 +from nazca.aligner import align, subalign, findneighbours, alignall
1707 +from nazca.dataio import parsefile, sparqlquery, write_results
1708 +
1709 +DEMODIR = path.dirname(__file__)
1710 +
1711 +def dpath(filename):
1712 +    return path.join(DEMODIR, 'demo', filename)
1713 +
1714 +def remove_after(string, sub):
1715 +    try:
1716 +        return string[:string.lower().index(sub)].strip()
1717 +    except ValueError:
1718 +        return string
1719 +
1720 +def parserql(host, rql):
1721 +    filehandle = urllib.urlopen('%(host)sview?'
1722 +                                'rql=%(rql)s&vid=csvexport'
1723 +                                % {'rql': rql, 'host': host})
1724 +    filehandle.readline()
1725 +    rset = [[e.decode('utf-8') for e in line.strip().split(';')]
1726 +            for line in filehandle]
1727 +    return rset
1728 +
1729 +def demo_0():
1730 +    # prixgoncourt is the list of Goncourt Prize, extracted
1731 +    # from wikipedia
1732 +
1733 +    #We try to align Goncourt winers onto dbpedia results
1734 +
1735 +    query = """
1736 +       SELECT ?writer, ?name WHERE {
1737 +          ?writer  <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:French_novelists>.
1738 +          ?writer rdfs:label ?name.
1739 +          FILTER(lang(?name) = 'fr')
1740 +       }
1741 +    """
1742 +
1743 +    print "Sending query to dbpedia"
1744 +    targetset = sparqlquery('http://dbpedia.org/sparql', query)
1745 +    print "Reading the prixgoncourt file"
1746 +    alignset = parsefile(dpath('prixgoncourt'), indexes=[1, 1])
1747 +
1748 +    tr_name = {'normalization': [lambda x:remove_after(x, '('),
1749 +                                 aln.simplify],
1750 +               'metric': ald.levenshtein
1751 +              }
1752 +
1753 +    processings = {1: tr_name}
1754 +
1755 +    print "Alignment started"
1756 +    align(alignset, targetset, 0.4, processings,
1757 +          dpath('demo0_results'))
1758 +
1759 +    print "Done, see the results in %s" % dpath('demo0_results')
1760 +
1761 +def demo_1():
1762 +    # FR.txt is an extract of geonames, where locations have been sorted by name
1763 +    # frenchbnf is an extract of french BNF's locations, sorted by name too
1764 +
1765 +    # For each line (ie location) we keep the identifier, the name and the
1766 +    # position (longitude, latitude)
1767 +    # ``nbmax`` is the number of locations to load
1768 +
1769 +    print "Parsing the input files"
1770 +    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
1771 +                          nbmax=2000)
1772 +    alignset = parsefile(dpath('frenchbnf'),
1773 +                         indexes=[0, 2, (14, 12)], nbmax=1000)
1774 +
1775 +
1776 +    # Let's define the processings to apply on the location's name
1777 +    tr_name = {'normalization': [aln.simplify], # Simplify all the names (remove
1778 +                                              #   punctuation, lower case, etc)
1779 +               'metric': ald.levenshtein,       # Use the levenshtein distance
1780 +               'weighting': 1                 # Use 1 as a name-distance matrix
1781 +                                              #   weighting coefficient
1782 +              }
1783 +    tr_geo = {'normalization': [],              # No normalization needed
1784 +              'metric': ald.geographical,         # Use the geographical distance
1785 +              'metric_params': {'units': 'km'},# Arguments given to the
1786 +                                                #   distance function. Here,
1787 +                                                #   the unit to use
1788 +              'weighting': 1
1789 +             }
1790 +
1791 +    processings = {1: tr_name, 2: tr_geo}
1792 +
1793 +    print "Alignment started"
1794 +    align(alignset,           # The dataset to align
1795 +          targetset,          # The target dataset
1796 +          0.4,                # The maximal distance
1797 +                              #   threshold
1798 +          processings,         # The list of processings to
1799 +                              #   apply.
1800 +          dpath('demo1_results'))
1801 +                              # Filename of the output
1802 +                              #   result file
1803 +    # the ``align()`` function return two items
1804 +    # 0. the computed distance matrix
1805 +    # 1. a boolean, True if at least one alignment has been done, False
1806 +    #    otherwise
1807 +    print "Done, see the results in %s" % dpath('demo1_results')
1808 +
1809 +def demo_2():
1810 +    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
1811 +                          formatopt={1:lambda x:x.decode('utf-8')})
1812 +    alignset = parsefile(dpath('frenchbnf'), indexes=[0, 2, (14, 12)],
1813 +                         formatopt={2:lambda x:x.decode('utf-8')}, nbmax=30000)
1814 +
1815 +    print "Finding neighbours"
1816 +    neighbours = findneighbours(alignset, targetset, indexes=(2, 2),
1817 +                               mode='minibatch')
1818 +
1819 +    # Let's define the processings to apply on the location's name
1820 +    tr_name = {'normalization': [aln.simplify], # Simplify all the names (remove
1821 +                                              #   punctuation, lower case, etc)
1822 +               'metric': ald.levenshtein,     # Use the levenshtein distance
1823 +               'weighting': 1                 # Use 1 as a name-distance matrix
1824 +                                              #   weighting coefficient
1825 +              }
1826 +
1827 +    processings = {1: tr_name}
1828 +    print "Start computation"
1829 +    for ind, (alignid, targetid) in enumerate(neighbours):
1830 +        print '%3d' % ind, len(alignid), 'x', len(targetid)
1831 +        _, matched = subalign(alignset,   # The dataset to align
1832 +                              targetset,  # The target dataset
1833 +                              alignid,
1834 +                              targetid,
1835 +                              0.3,
1836 +                              processings)
1837 +        write_results(matched, alignset, targetset, dpath('demo2_results'))
1838 +    print "Done, see the results in %s" % dpath('demo2_results')
1839 +
1840 +def demo_3():
1841 +    print "Parsing files"
1842 +    alignset = parserql(host='http://demo.cubicweb.org/elections/',
1843 +                        rql='Any E, N WHERE X is Commune, X eid E, X label N')
1844 +    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1])
1845 +    print '%s×%s' % (len(alignset), len(targetset))
1846 +
1847 +    tr_name = {'normalization': [aln.simplify],
1848 +               'metric': 'levenshtein'
1849 +              }
1850 +
1851 +    print "Alignment started"
1852 +    results = alignall(alignset, targetset, 0.75, processings={1: tr_name},
1853 +                       indexes=(1,1), mode='minhashing', kwordsgram=1, siglen=200,
1854 +                       uniq=True)
1855 +    dicresults = dict([(a, b) for (a, b) in results])
1856 +
1857 +    print "Done, writing output"
1858 +
1859 +    with open(dpath('demo3_results'), 'w') as fout:
1860 +        for line in alignset:
1861 +            sent = u'http://demo.cubicweb.org/elections/commune/%s;'\
1862 +                   u'http://www.geonames.org/%s\n' \
1863 +                   % (line[0], dicresults.get(line[0], 'not_found'))
1864 +            fout.write(sent.encode('utf-8'))
1865 +
1866 +    print "See the results in %s" % dpath('demo3_results')
1867 +
1868 +if __name__ == '__main__':
1869 +    import sys
1870 +    from time import time
1871 +    runall = (len(sys.argv) == 1)
1872 +
1873 +    t = time()
1874 +    if runall or '0' in sys.argv:
1875 +        print "Running demo_0"
1876 +        demo_0()
1877 +
1878 +    if runall or '1' in sys.argv:
1879 +        print "Running demo_1"
1880 +        demo_1()
1881 +
1882 +    if runall or '2' in sys.argv:
1883 +        print "Running demo_2"
1884 +        ## Same as demo_1, but in a more efficient way, using a method to find
1885 +        ## neighbours
1886 +        demo_2()
1887 +
1888 +    if runall or '3' in sys.argv:
1889 +        print "Running demo_3"
1890 +        demo_3()
1891 +
1892 +    print "Demo terminated"
1893 +    print "Took %d min" % ((time() - t)/60.)
diff --git a/named_entities/__init__.py b/named_entities/__init__.py
@@ -1,80 +0,0 @@
1894 -# -*- coding: utf-8 -*-
1895 -""" Process/Core functions for Named Entities Recognition.
1896 -"""
1897 -from nazca.utils.tokenizer import RichStringTokenizer
1898 -
1899 -
1900 -###############################################################################
1901 -### NER PROCESS ###############################################################
1902 -###############################################################################
1903 -class NerProcess(object):
1904 -    """ High-level process for Named Entities Recognition
1905 -    """
1906 -
1907 -    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
1908 -        """ Initialise the class.
1909 -
1910 -        :tokenizer: an instance of tokenizer
1911 -        """
1912 -        self.ner_sources = list(ner_sources)
1913 -        self.preprocessors = preprocessors or []
1914 -        self.filters = filters or []
1915 -        self.unique = unique
1916 -
1917 -    def add_ner_source(self, process):
1918 -        """ Add a ner process
1919 -        """
1920 -        self.ner_sources.append(process)
1921 -
1922 -    def add_preprocessors(self, preprocessor):
1923 -        """ Add a preprocessor
1924 -        """
1925 -        self.preprocessors.append(preprocessor)
1926 -
1927 -    def add_filters(self, filter):
1928 -        """ Add a filter
1929 -        """
1930 -        self.filters.append(filter)
1931 -
1932 -    def process_text(self, text):
1933 -        """ High level function for analyzing a text
1934 -        """
1935 -        tokenizer = RichStringTokenizer(text)
1936 -        return self.recognize_tokens(tokenizer)
1937 -
1938 -    def recognize_tokens(self, tokens):
1939 -        """ Recognize Named Entities from a tokenizer or
1940 -        an iterator yielding tokens.
1941 -        """
1942 -        last_stop = 0
1943 -        named_entities = []
1944 -        for token in tokens:
1945 -            if token.start < last_stop:
1946 -                continue # this token overlaps with a previous match
1947 -            word = token.word
1948 -            # Applies preprocessors
1949 -            # XXX Preprocessors may be sources dependant
1950 -            for preprocessor in self.preprocessors:
1951 -                token = preprocessor(token)
1952 -                if not token:
1953 -                    break
1954 -            if not token:
1955 -                continue
1956 -            recognized = False
1957 -            for process in self.ner_sources:
1958 -                for uri in process.recognize_token(token):
1959 -                    named_entities.append((uri, process.name, token))
1960 -                    recognized = True
1961 -                    last_stop = token.end
1962 -                    if self.unique:
1963 -                        break
1964 -                if recognized and self.unique:
1965 -                    break
1966 -        # XXX Postprocess/filters may be sources dependant
1967 -        return self.postprocess(named_entities)
1968 -
1969 -    def postprocess(self, named_entities):
1970 -        """ Postprocess the results by applying filters """
1971 -        for filter in self.filters:
1972 -            named_entities = filter(named_entities)
1973 -        return named_entities
diff --git a/named_entities/filters.py b/named_entities/filters.py
@@ -1,103 +0,0 @@
1974 -# -*- coding: utf-8 -*-
1975 -""" Filters for Named Entities Recognition.
1976 -"""
1977 -from nazca.utils.dataio import sparqlquery
1978 -
1979 -
1980 -###############################################################################
1981 -### NER FILTERS ###############################################################
1982 -###############################################################################
1983 -class AbstractNerFilter(object):
1984 -    """ A filter used for cleaning named entities results
1985 -    """
1986 -
1987 -    def __call__(self, named_entities):
1988 -        raise NotImplementedError
1989 -
1990 -
1991 -class NerOccurenceFilter(object):
1992 -    """ A filter based on the number of occurence of
1993 -    named entities in the results.
1994 -    """
1995 -    def __init__(self, min_occ=None, max_occ=None):
1996 -        self.min_occ = min_occ
1997 -        self.max_occ = max_occ
1998 -
1999 -    def __call__(self, named_entities):
2000 -        uris = [u for u, p, t in named_entities]
2001 -        counts = dict([(u, uris.count(u)) for u in set(uris)])
2002 -        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
2003 -                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
2004 -
2005 -
2006 -class NerRDFTypeFilter(object):
2007 -    """ A filter based on the RDF type on entity
2008 -    E.g.
2009 -
2010 -    filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
2011 -                                ('http://schema.org/Place',
2012 -                                'http://dbpedia.org/ontology/Agent',
2013 -                                'http://dbpedia.org/ontology/Place'))
2014 -
2015 -    """
2016 -    def __init__(self, endpoint, accepted_types):
2017 -        self.endpoint = endpoint
2018 -        self.accepted_types = accepted_types
2019 -        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
2020 -
2021 -    def __call__(self, named_entities):
2022 -        filtered_named_entities = []
2023 -        seen_uris = {}
2024 -        for uri, p, t in named_entities:
2025 -            if uri in seen_uris:
2026 -                if seen_uris[uri]:
2027 -                    filtered_named_entities.append((uri, p, t))
2028 -            else:
2029 -                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
2030 -                types = set([r['type']['value'] for r in results])
2031 -                if not len(types.intersection(self.accepted_types)):
2032 -                    seen_uris[uri] = False
2033 -                else:
2034 -                    seen_uris[uri] = True
2035 -                    filtered_named_entities.append((uri, p, t))
2036 -        return filtered_named_entities
2037 -
2038 -
2039 -class NerDisambiguationWordParts(object):
2040 -    """ Disambiguate named entities based on the words parts.
2041 -    E.g.:
2042 -          'toto tutu': 'http://example.com/toto_tutu',
2043 -          'toto': 'http://example.com/toto'
2044 -
2045 -          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
2046 -          by 'http://example.com/toto_tutu'
2047 -    """
2048 -    def __call__(self, named_entities):
2049 -        # Create parts dictionnary
2050 -        parts = {}
2051 -        for uri, peid, token in named_entities:
2052 -            if ' ' in token.word:
2053 -                for part in token.word.split(' '):
2054 -                    parts[part.lower()] = uri
2055 -        # Replace named entities
2056 -        filtered_named_entities = []
2057 -        for uri, peid, token in named_entities:
2058 -            if token.word in parts:
2059 -                # Change URI
2060 -                uri = parts[token.word]
2061 -            filtered_named_entities.append((uri, peid, token))
2062 -        return filtered_named_entities
2063 -
2064 -
2065 -class NerReplacementRulesFilter(object):
2066 -    """ Allow to define replacement rules for Named Entities
2067 -    """
2068 -    def __init__(self,rules):
2069 -        self.rules = rules
2070 -
2071 -    def __call__(self, named_entities):
2072 -        filtered_named_entities = []
2073 -        for uri, peid, token in named_entities:
2074 -            uri = self.rules.get(uri, uri)
2075 -            filtered_named_entities.append((uri, peid, token))
2076 -        return filtered_named_entities
diff --git a/named_entities/preprocessors.py b/named_entities/preprocessors.py
@@ -1,83 +0,0 @@
2077 -# -*- coding: utf-8 -*-
2078 -""" Preprocessors for Named Entities Recognition.
2079 -"""
2080 -from nazca.utils.tokenizer import Token
2081 -from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
2082 -
2083 -STOPWORDS = {'fr': FRENCH_STOPWORDS,
2084 -             'en': ENGLISH_STOPWORDS}
2085 -
2086 -
2087 -###############################################################################
2088 -### NER PREPROCESSORS #########################################################
2089 -###############################################################################
2090 -class AbstractNerPreprocessor(object):
2091 -    """ Preprocessor
2092 -    """
2093 -
2094 -    def __call__(self, token):
2095 -        raise NotImplementedError
2096 -
2097 -
2098 -class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
2099 -    """ Remove token based on the size of the word
2100 -    """
2101 -    def __init__(self, min_size=None, max_size=None):
2102 -        self.min_size = min_size
2103 -        self.max_size = max_size
2104 -
2105 -    def __call__(self, token):
2106 -        if ((self.min_size and len(token.word)<self.min_size)
2107 -            or (self.max_size and len(token.word)>self.max_size)):
2108 -            return None
2109 -        return token
2110 -
2111 -
2112 -class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
2113 -    """ Remove token with word in lower case
2114 -    """
2115 -
2116 -    def __call__(self, token):
2117 -        return None if token.word.islower() else token
2118 -
2119 -
2120 -class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
2121 -    """ Lower the first word of each sentence if it is a stopword.
2122 -    """
2123 -    def __init__(self, lang='en'):
2124 -        self.lang = lang
2125 -
2126 -    def __call__(self, token):
2127 -        if (token.start == token.sentence.start and
2128 -            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
2129 -            word = token.word[0].lower() + token.word[1:]
2130 -            return Token(word, token.start, token.end, token.sentence)
2131 -        return token
2132 -
2133 -
2134 -class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
2135 -    """ Remove stopwords
2136 -    """
2137 -    def __init__(self, split_words=False, lang='en'):
2138 -        self.split_words = split_words
2139 -        self.lang = lang
2140 -
2141 -    def __call__(self, token):
2142 -        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
2143 -        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
2144 -            return None
2145 -        if not self.split_words and token.word.lower() in stopwords:
2146 -            return None
2147 -        return token
2148 -
2149 -
2150 -class NerHashTagPreprocessor(AbstractNerPreprocessor):
2151 -    """ Cleanup hashtag
2152 -    """
2153 -    def __call__(self, token):
2154 -        if token.word.startswith('@'):
2155 -            # XXX Split capitalize letter ?
2156 -            # @BarackObama -> Barack Obama
2157 -            word = token.word[1:].replace('_', ' ')
2158 -            return Token(word, token.start, token.end, token.sentence)
2159 -        return token
diff --git a/named_entities/sources.py b/named_entities/sources.py
@@ -1,124 +0,0 @@
2160 -# -*- coding: utf-8 -*-
2161 -""" Sources for Named Entities Recognition.
2162 -"""
2163 -from nazca.utils.tokenizer import Token
2164 -from nazca.utils.dataio import sparqlquery, rqlquery
2165 -
2166 -
2167 -###############################################################################
2168 -### NER SOURCE ################################################################
2169 -###############################################################################
2170 -class AbstractNerSource(object):
2171 -    """ High-level source for Named Entities Recognition
2172 -    """
2173 -
2174 -    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
2175 -        """ Initialise the class.
2176 -        """
2177 -        self.endpoint = endpoint
2178 -        self.query = query
2179 -        self.name = name
2180 -        self.preprocessors = preprocessors or []
2181 -        self.use_cache = use_cache
2182 -        self._recognized_cache = {}
2183 -
2184 -    def add_preprocessors(self, preprocessor):
2185 -        """ Add a preprocessor
2186 -        """
2187 -        self.preprocessors.append(preprocessor)
2188 -
2189 -    def recognize_token(self, token):
2190 -        """ Recognize a token
2191 -        """
2192 -        # Applies source specific preprocessors
2193 -        for preprocessor in self.preprocessors:
2194 -            token = preprocessor(token)
2195 -            if not token:
2196 -                return []
2197 -        if self.use_cache and token.word in self._recognized_cache:
2198 -            return self._recognized_cache[token.word]
2199 -        uris = self.query_word(token.word) if token.word else []
2200 -        if self.use_cache:
2201 -            self._recognized_cache[token.word] = uris
2202 -        return uris
2203 -
2204 -    def query_word(self, word):
2205 -        """ Query a word for a Named Entities Recognition process
2206 -        """
2207 -        raise NotImplementedError
2208 -
2209 -
2210 -class NerSourceLexicon(AbstractNerSource):
2211 -    """ Source based on a (pre-computed) dictionnary of words (token, uri)
2212 -    """
2213 -    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
2214 -        self.lexicon = lexicon
2215 -        self.name = name
2216 -        self.preprocessors = preprocessors or []
2217 -        self.use_cache = use_cache
2218 -        self._recognized_cache = {}
2219 -
2220 -    def query_word(self, word):
2221 -        uri = self.lexicon.get(word)
2222 -        return [uri,] if uri else []
2223 -
2224 -
2225 -class NerSourceLocalRql(AbstractNerSource):
2226 -    """ High-level source for Named Entities Recognition
2227 -    Local RQL version
2228 -    """
2229 -
2230 -    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
2231 -        """ Initialise the class.
2232 -        """
2233 -        self.query = query
2234 -        self.session = session
2235 -        self.name = name
2236 -        self.preprocessors = preprocessors or []
2237 -        self.use_cache = use_cache
2238 -        self._recognized_cache = {}
2239 -
2240 -    def query_word(self, word):
2241 -        """ Query a word for a Named Entities Recognition process
2242 -        """
2243 -        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
2244 -
2245 -
2246 -class NerSourceRql(AbstractNerSource):
2247 -    """ High-level source for Named Entities Recognition
2248 -    Url version (distant source)
2249 -    """
2250 -
2251 -    def query_word(self, word):
2252 -        """ Query a word for a Named Entities Recognition process
2253 -        """
2254 -        if self.endpoint.startswith('http://'):
2255 -            # url
2256 -            return [r[0] for r in rqlquery(self.endpoint, self.query % {'word': word})]
2257 -        else:
2258 -            return [r[0] for r in rqlquery(self.endpoint, self.query, word=word)]
2259 -
2260 -
2261 -class NerSourceSparql(AbstractNerSource):
2262 -    """ High-level source for Named Entities Recognition
2263 -    SPARQL version
2264 -
2265 -   >>> from ner.core import NerSourceSparql
2266 -   >>> ner_source = NerSourceSparql('''SELECT ?uri
2267 -                                         WHERE{
2268 -                                         ?uri rdfs:label "%(word)s"@en}''',
2269 -			                 'http://dbpedia.org/sparql')
2270 -   >>> print ner_source.recognize_token('Victor Hugo')
2271 -		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
2272 -		     'http://dbpedia.org/resource/Victor_Hugo',
2273 -		     'http://dbpedia.org/class/yago/VictorHugo',
2274 -		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
2275 -		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
2276 -		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
2277 -
2278 -    """
2279 -
2280 -    def query_word(self, word):
2281 -        """ Query a word for a Named Entities Recognition process
2282 -        """
2283 -        return [r[0] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
diff --git a/ner/__init__.py b/ner/__init__.py
@@ -0,0 +1,80 @@
2284 +# -*- coding: utf-8 -*-
2285 +""" Process/Core functions for Named Entities Recognition.
2286 +"""
2287 +from nazca.utils.tokenizer import RichStringTokenizer
2288 +
2289 +
2290 +###############################################################################
2291 +### NER PROCESS ###############################################################
2292 +###############################################################################
2293 +class NerProcess(object):
2294 +    """ High-level process for Named Entities Recognition
2295 +    """
2296 +
2297 +    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
2298 +        """ Initialise the class.
2299 +
2300 +        :tokenizer: an instance of tokenizer
2301 +        """
2302 +        self.ner_sources = list(ner_sources)
2303 +        self.preprocessors = preprocessors or []
2304 +        self.filters = filters or []
2305 +        self.unique = unique
2306 +
2307 +    def add_ner_source(self, process):
2308 +        """ Add a ner process
2309 +        """
2310 +        self.ner_sources.append(process)
2311 +
2312 +    def add_preprocessors(self, preprocessor):
2313 +        """ Add a preprocessor
2314 +        """
2315 +        self.preprocessors.append(preprocessor)
2316 +
2317 +    def add_filters(self, filter):
2318 +        """ Add a filter
2319 +        """
2320 +        self.filters.append(filter)
2321 +
2322 +    def process_text(self, text):
2323 +        """ High level function for analyzing a text
2324 +        """
2325 +        tokenizer = RichStringTokenizer(text)
2326 +        return self.recognize_tokens(tokenizer)
2327 +
2328 +    def recognize_tokens(self, tokens):
2329 +        """ Recognize Named Entities from a tokenizer or
2330 +        an iterator yielding tokens.
2331 +        """
2332 +        last_stop = 0
2333 +        named_entities = []
2334 +        for token in tokens:
2335 +            if token.start < last_stop:
2336 +                continue # this token overlaps with a previous match
2337 +            word = token.word
2338 +            # Applies preprocessors
2339 +            # XXX Preprocessors may be sources dependent
2340 +            for preprocessor in self.preprocessors:
2341 +                token = preprocessor(token)
2342 +                if not token:
2343 +                    break
2344 +            if not token:
2345 +                continue
2346 +            recognized = False
2347 +            for process in self.ner_sources:
2348 +                for uri in process.recognize_token(token):
2349 +                    named_entities.append((uri, process.name, token))
2350 +                    recognized = True
2351 +                    last_stop = token.end
2352 +                    if self.unique:
2353 +                        break
2354 +                if recognized and self.unique:
2355 +                    break
2356 +        # XXX Postprocess/filters may be sources dependent
2357 +        return self.postprocess(named_entities)
2358 +
2359 +    def postprocess(self, named_entities):
2360 +        """ Postprocess the results by applying filters """
2361 +        for filter in self.filters:
2362 +            named_entities = filter(named_entities)
2363 +        return named_entities
diff --git a/ner/filters.py b/ner/filters.py
@@ -0,0 +1,103 @@
2364 +# -*- coding: utf-8 -*-
2365 +""" Filters for Named Entities Recognition.
2366 +"""
2367 +from nazca.utils.dataio import sparqlquery
2368 +
2369 +
2370 +###############################################################################
2371 +### NER FILTERS ###############################################################
2372 +###############################################################################
2373 +class AbstractNerFilter(object):
2374 +    """ A filter used for cleaning named entities results
2375 +    """
2376 +
2377 +    def __call__(self, named_entities):
2378 +        raise NotImplementedError
2379 +
2380 +
2381 +class NerOccurenceFilter(object):
2382 +    """ A filter based on the number of occurrences of
2383 +    named entities in the results.
2384 +    """
2385 +    def __init__(self, min_occ=None, max_occ=None):
2386 +        self.min_occ = min_occ
2387 +        self.max_occ = max_occ
2388 +
2389 +    def __call__(self, named_entities):
2390 +        uris = [u for u, p, t in named_entities]
2391 +        counts = dict([(u, uris.count(u)) for u in set(uris)])
2392 +        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
2393 +                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
2394 +
2395 +
2396 +class NerRDFTypeFilter(object):
2397 +    """ A filter based on the RDF type on entity
2398 +    E.g.
2399 +
2400 +    filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
2401 +                                ('http://schema.org/Place',
2402 +                                'http://dbpedia.org/ontology/Agent',
2403 +                                'http://dbpedia.org/ontology/Place'))
2404 +
2405 +    """
2406 +    def __init__(self, endpoint, accepted_types):
2407 +        self.endpoint = endpoint
2408 +        self.accepted_types = accepted_types
2409 +        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
2410 +
2411 +    def __call__(self, named_entities):
2412 +        filtered_named_entities = []
2413 +        seen_uris = {}
2414 +        for uri, p, t in named_entities:
2415 +            if uri in seen_uris:
2416 +                if seen_uris[uri]:
2417 +                    filtered_named_entities.append((uri, p, t))
2418 +            else:
2419 +                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
2420 +                types = set([r['type']['value'] for r in results])
2421 +                if not len(types.intersection(self.accepted_types)):
2422 +                    seen_uris[uri] = False
2423 +                else:
2424 +                    seen_uris[uri] = True
2425 +                    filtered_named_entities.append((uri, p, t))
2426 +        return filtered_named_entities
2427 +
2428 +
2429 +class NerDisambiguationWordParts(object):
2430 +    """ Disambiguate named entities based on the words parts.
2431 +    E.g.:
2432 +          'toto tutu': 'http://example.com/toto_tutu',
2433 +          'toto': 'http://example.com/toto'
2434 +
2435 +          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
2436 +          by 'http://example.com/toto_tutu'
2437 +    """
2438 +    def __call__(self, named_entities):
2439 +        # Create parts dictionary
2440 +        parts = {}
2441 +        for uri, peid, token in named_entities:
2442 +            if ' ' in token.word:
2443 +                for part in token.word.split(' '):
2444 +                    parts[part.lower()] = uri
2445 +        # Replace named entities
2446 +        filtered_named_entities = []
2447 +        for uri, peid, token in named_entities:
2448 +            if token.word in parts:
2449 +                # Change URI
2450 +                uri = parts[token.word]
2451 +            filtered_named_entities.append((uri, peid, token))
2452 +        return filtered_named_entities
2453 +
2454 +
2455 +class NerReplacementRulesFilter(object):
2456 +    """ Allow to define replacement rules for Named Entities
2457 +    """
2458 +    def __init__(self,rules):
2459 +        self.rules = rules
2460 +
2461 +    def __call__(self, named_entities):
2462 +        filtered_named_entities = []
2463 +        for uri, peid, token in named_entities:
2464 +            uri = self.rules.get(uri, uri)
2465 +            filtered_named_entities.append((uri, peid, token))
2466 +        return filtered_named_entities
diff --git a/ner/preprocessors.py b/ner/preprocessors.py
@@ -0,0 +1,83 @@
2467 +# -*- coding: utf-8 -*-
2468 +""" Preprocessors for Named Entities Recognition.
2469 +"""
2470 +from nazca.utils.tokenizer import Token
2471 +from nazca.data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
2472 +
# Map an ISO language code to the stopword set used by the preprocessors below;
# lookups fall back to English for unknown languages.
STOPWORDS = {'fr': FRENCH_STOPWORDS,
             'en': ENGLISH_STOPWORDS}
2475 +
2476 +
2477 +###############################################################################
2478 +### NER PREPROCESSORS #########################################################
2479 +###############################################################################
class AbstractNerPreprocessor(object):
    """ Base class for NER preprocessors.

    A preprocessor is a callable that takes a token and returns either a
    (possibly modified) token, or None to discard it.
    """

    def __call__(self, token):
        """ Process a token; concrete subclasses must implement this. """
        raise NotImplementedError
2486 +
2487 +
class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
    """ Discard tokens whose word length is outside the given bounds.

    min_size / max_size are inclusive bounds; None disables a bound.
    """

    def __init__(self, min_size=None, max_size=None):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, token):
        """ Return the token, or None when its word is too short or too long. """
        length = len(token.word)
        too_short = self.min_size and length < self.min_size
        too_long = self.max_size and length > self.max_size
        return None if (too_short or too_long) else token
2500 +
2501 +
class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
    """ Discard tokens whose word is entirely lower case. """

    def __call__(self, token):
        """ Return None for all-lowercase words, the token otherwise. """
        if token.word.islower():
            return None
        return token
2508 +
2509 +
class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
    """ Lower-case the first letter of a sentence-initial token when its
    first word is a stopword (so e.g. 'The' does not look like a proper noun).
    """

    def __init__(self, lang='en'):
        self.lang = lang

    def __call__(self, token):
        """ Return a new Token with a lower-cased first letter when needed. """
        if token.start != token.sentence.start:
            return token
        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
        if token.word.split()[0].lower() not in stopwords:
            return token
        lowered = token.word[0].lower() + token.word[1:]
        return Token(lowered, token.start, token.end, token.sentence)
2522 +
2523 +
class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
    """ Discard stopword tokens.

    With split_words=False the whole word must be a stopword to be dropped;
    with split_words=True the token is dropped when every one of its
    whitespace-separated words is a stopword.
    """

    def __init__(self, split_words=False, lang='en'):
        self.split_words = split_words
        self.lang = lang

    def __call__(self, token):
        """ Return the token, or None when it only consists of stopwords. """
        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
        if self.split_words:
            remaining = [w for w in token.word.split()
                         if w.lower() not in stopwords]
            return token if remaining else None
        return None if token.word.lower() in stopwords else token
2538 +
2539 +
class NerHashTagPreprocessor(AbstractNerPreprocessor):
    """ Clean up Twitter-style '@' tags: strip the leading '@' and turn
    underscores into spaces (e.g. '@Barack_Obama' -> 'Barack Obama').
    """

    def __call__(self, token):
        """ Return a cleaned Token for '@'-prefixed words, else the token. """
        word = token.word
        if not word.startswith('@'):
            return token
        # XXX also split on capitalized letters (@BarackObama -> Barack Obama)?
        cleaned = word[1:].replace('_', ' ')
        return Token(cleaned, token.start, token.end, token.sentence)
diff --git a/ner/sources.py b/ner/sources.py
@@ -0,0 +1,124 @@
2550 +# -*- coding: utf-8 -*-
2551 +""" Sources for Named Entities Recognition.
2552 +"""
2553 +from nazca.utils.tokenizer import Token
2554 +from nazca.utils.dataio import sparqlquery, rqlquery
2555 +
2556 +
2557 +###############################################################################
2558 +### NER SOURCE ################################################################
2559 +###############################################################################
class AbstractNerSource(object):
    """ Base class for Named Entities Recognition sources.

    A source queries an endpoint to retrieve the URIs matching a word,
    after running the token through source-specific preprocessors.
    """

    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
        """ Initialize the source.

        endpoint: where the queries are sent
        query: the query used to look a word up
        name: optional human-readable name of the source
        use_cache: if True, cache the URIs found for each word
        preprocessors: list of callables applied to each token
        """
        self.endpoint = endpoint
        self.query = query
        self.name = name
        self.preprocessors = preprocessors or []
        self.use_cache = use_cache
        self._recognized_cache = {}

    def add_preprocessors(self, preprocessor):
        """ Register an additional preprocessor. """
        self.preprocessors.append(preprocessor)

    def recognize_token(self, token):
        """ Return the list of URIs recognized for the given token. """
        # Source-specific preprocessing; a preprocessor may drop the token
        for preprocessor in self.preprocessors:
            token = preprocessor(token)
            if not token:
                return []
        word = token.word
        if self.use_cache and word in self._recognized_cache:
            return self._recognized_cache[word]
        uris = self.query_word(word) if word else []
        if self.use_cache:
            self._recognized_cache[word] = uris
        return uris

    def query_word(self, word):
        """ Query a word; concrete subclasses must implement this. """
        raise NotImplementedError
2598 +
2599 +
class NerSourceLexicon(AbstractNerSource):
    """ Source backed by a pre-computed lexicon (dict mapping word -> uri). """

    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
        # No endpoint/query here: lookups are plain dictionary accesses.
        self.lexicon = lexicon
        self.name = name
        self.preprocessors = preprocessors or []
        self.use_cache = use_cache
        self._recognized_cache = {}

    def query_word(self, word):
        """ Return [uri] when the word is in the lexicon, else []. """
        uri = self.lexicon.get(word)
        return [uri] if uri else []
2613 +
2614 +
class NerSourceLocalRql(AbstractNerSource):
    """ NER source executing an RQL query on a local session. """

    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
        """ Initialize the source with a local session and an RQL query. """
        self.session = session
        self.query = query
        self.name = name
        self.preprocessors = preprocessors or []
        self.use_cache = use_cache
        self._recognized_cache = {}

    def query_word(self, word):
        """ Return the URIs found by executing the RQL query on the word. """
        rset = self.session.execute(self.query, dict(word=word))
        return [row[0] for row in rset]
2634 +
2635 +
class NerSourceRql(AbstractNerSource):
    """ NER source querying a (possibly distant) RQL endpoint. """

    def query_word(self, word):
        """ Return the URIs found for the word on the RQL endpoint. """
        if self.endpoint.startswith('http://'):
            # Distant url: the word is interpolated into the query string
            rset = rqlquery(self.endpoint, self.query % {'word': word})
        else:
            rset = rqlquery(self.endpoint, self.query, word=word)
        return [row[0] for row in rset]
2649 +
2650 +
class NerSourceSparql(AbstractNerSource):
    """ NER source querying a SPARQL endpoint.

    >>> from nazca.ner.sources import NerSourceSparql
    >>> ner_source = NerSourceSparql('http://dbpedia.org/sparql',
    ...                              '''SELECT ?uri
    ...                                 WHERE{
    ...                                 ?uri rdfs:label "%(word)s"@en}''')
    >>> ner_source.query_word('Victor Hugo')
    ... ['http://dbpedia.org/resource/Category:Victor_Hugo',
         'http://dbpedia.org/resource/Victor_Hugo',
         'http://dbpedia.org/class/yago/VictorHugo',
         'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
         'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
         'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
    """

    def query_word(self, word):
        """ Return the URIs found for the word on the SPARQL endpoint. """
        results = sparqlquery(self.endpoint, self.query % {'word': word})
        return [row[0] for row in results]
diff --git a/record_linkage/__init__.py b/record_linkage/__init__.py
diff --git a/record_linkage/aligner.py b/record_linkage/aligner.py
@@ -1,324 +0,0 @@
2674 -# -*- coding:utf-8 -*-
2675 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
2676 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
2677 -#
2678 -# This program is free software: you can redistribute it and/or modify it under
2679 -# the terms of the GNU Lesser General Public License as published by the Free
2680 -# Software Foundation, either version 2.1 of the License, or (at your option)
2681 -# any later version.
2682 -#
2683 -# This program is distributed in the hope that it will be useful, but WITHOUT
2684 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
2685 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
2686 -# details.
2687 -#
2688 -# You should have received a copy of the GNU Lesser General Public License along
2689 -# with this program. If not, see <http://www.gnu.org/licenses/>.
2690 -import time
2691 -import logging
2692 -from collections import defaultdict
2693 -
2694 -from scipy import zeros
2695 -from scipy.sparse import lil_matrix
2696 -
2697 -from nazca.utils.dataio import parsefile
2698 -
2699 -
2700 -###############################################################################
2701 -### UTILITY FUNCTIONS #########################################################
2702 -###############################################################################
def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
    """ Yield aligned pairs as ((ref_id, ref_index), (target_id, target_index), distance).

    With unique=True only the closest matched target is yielded for each
    reference record; otherwise every matched target is yielded.  The
    distance is None when no global matrix is given.
    """
    for refid, candidates in global_matched.items():
        if unique:
            # Keep only the candidate with the smallest distance
            targetids = [min(candidates, key=lambda c: c[1])[0]]
        else:
            targetids = [targetid for targetid, _ in candidates]
        for targetid in targetids:
            dist = global_mat[refid, targetid] if global_mat is not None else None
            yield ((refset[refid][0], refid),
                   (targetset[targetid][0], targetid),
                   dist)
2720 -
2721 -
2722 -###############################################################################
2723 -### BASE ALIGNER OBJECT #######################################################
2724 -###############################################################################
class BaseAligner(object):
    """ Align a reference set with a target set.

    For each pair of records, a distance is computed by summing the
    distance matrices of the registered processings; the pairs whose
    distance is below ``threshold`` are considered aligned.  Normalizers
    may be registered to preprocess the datasets, and a blocking object
    may be registered to avoid the full quadratic comparison.
    """

    def __init__(self, threshold, processings, normalize_matrix=False):
        """ threshold: maximal distance between two aligned records
        processings: list of objects providing a cdist() method that
                     computes a pairwise distance matrix
        normalize_matrix: if True, divide the distance matrix by its
                          maximum before thresholding
        """
        self.threshold = threshold
        self.processings = processings
        self.normalize_matrix = normalize_matrix
        self.ref_normalizer = None
        # Note: this attribute was accidentally assigned twice before
        self.target_normalizer = None
        self.blocking = None
        # Statistics filled during align()/get_aligned_pairs()
        self.alignments_done = 0
        self.pairs_found = 0
        self.nb_comparisons = 0
        self.nb_blocks = 0
        self.refset_size = None
        self.targetset_size = None
        self.time = None
        self.logger = logging.getLogger('nazca.aligner')

    def register_ref_normalizer(self, normalizer):
        """ Register a normalizer to be applied
        to the reference set before alignment """
        self.ref_normalizer = normalizer

    def register_target_normalizer(self, normalizer):
        """ Register a normalizer to be applied
        to the target set before alignment """
        self.target_normalizer = normalizer

    def register_blocking(self, blocking):
        """ Register a blocking technique restricting the comparisons """
        self.blocking = blocking

    def apply_normalization(self, dataset, normalizer):
        """ Return the normalized dataset, or the dataset itself
        when no normalizer is given """
        if normalizer:
            return normalizer.normalize_dataset(dataset)
        return dataset

    def compute_distance_matrix(self, refset, targetset,
                                ref_indexes, target_indexes):
        """ Compute and return the global alignment matrix.
        For each `processing` a `Distancematrix` is built, then all the
        matrices are summed with their own weighting and the result is the
        global alignment matrix, which is returned.
        """
        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
        for processing in self.processings:
            distmatrix += processing.cdist(refset, targetset,
                                           ref_indexes, target_indexes)
        return distmatrix

    def threshold_matched(self, distmatrix):
        """ Return the matched elements within a dictionnary,
        each key being the indice from X, and the corresponding
        values being a list of couple (indice from Y, distance)
        """
        match = defaultdict(list)
        if self.normalize_matrix:
            # In-place normalization by the matrix maximum
            distmatrix /= distmatrix.max()
        ind = (distmatrix <= self.threshold).nonzero()
        indrow = ind[0].tolist()
        indcol = ind[1].tolist()
        for (i, j) in zip(indrow, indcol):
            match[i].append((j, distmatrix[i, j]))
        return match

    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
        """ Compute the distance matrix restricted to the given indexes and
        return it with the matches re-expressed in global indexes """
        ref_indexes = ref_indexes or range(len(refset))
        target_indexes = target_indexes or range(len(targetset))
        # Apply alignments
        mat = self.compute_distance_matrix(refset, targetset,
                                           ref_indexes=ref_indexes,
                                           target_indexes=target_indexes)
        matched = self.threshold_matched(mat)
        # Reapply matched to global indexes
        new_matched = {}
        for k, values in matched.items():
            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
        return mat, new_matched

    def align(self, refset, targetset, get_matrix=True):
        """ Perform the alignment on the referenceset
        and the targetset.  Return (distance matrix, matched dictionary).
        """
        start_time = time.time()
        refset = self.apply_normalization(refset, self.ref_normalizer)
        targetset = self.apply_normalization(targetset, self.target_normalizer)
        self.refset_size = len(refset)
        self.targetset_size = len(targetset)
        # If no blocking, perform a single global match
        if not self.blocking:
            return self._get_match(refset, targetset)
        # Blocking == conquer_and_divide
        global_matched = {}
        global_mat = lil_matrix((len(refset), len(targetset)))
        self.blocking.fit(refset, targetset)
        for refblock, targetblock in self.blocking.iter_blocks():
            self.nb_blocks += 1
            ref_index = [r[0] for r in refblock]
            target_index = [r[0] for r in targetblock]
            self.nb_comparisons += len(ref_index) * len(target_index)
            _, matched = self._get_match(refset, targetset, ref_index, target_index)
            for k, values in matched.items():
                subdict = global_matched.setdefault(k, set())
                for v, d in values:
                    subdict.add((v, d))
                    self.alignments_done += 1
                    if get_matrix:
                        # XXX avoid issue in sparse matrix: a strictly null
                        # distance would not be stored, use a tiny epsilon
                        global_mat[k, v] = d or 10**(-10)
        self.time = time.time() - start_time
        return global_mat, global_matched

    def get_aligned_pairs(self, refset, targetset, unique=True):
        """ Get the pairs of aligned elements
        """
        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
            self.pairs_found += 1
            yield pair
        self.log_infos()

    def align_from_files(self, reffile, targetfile,
                         ref_indexes=None, target_indexes=None,
                         ref_encoding=None, target_encoding=None,
                         ref_separator='\t', target_separator='\t',
                         get_matrix=True):
        """ Align data from files

        Parameters
        ----------

        reffile: name of the reference file

        targetfile: name of the target file

        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
                      be used to read the files.

        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
                         be used to read the files.

        ref_separator: separator of the reference file

        target_separator: separator of the target file
        """
        refset = parsefile(reffile, indexes=ref_indexes,
                           encoding=ref_encoding, delimiter=ref_separator)
        targetset = parsefile(targetfile, indexes=target_indexes,
                              encoding=target_encoding, delimiter=target_separator)
        return self.align(refset, targetset, get_matrix=get_matrix)

    def get_aligned_pairs_from_files(self, reffile, targetfile,
                         ref_indexes=None, target_indexes=None,
                         ref_encoding=None, target_encoding=None,
                         ref_separator='\t', target_separator='\t',
                         unique=True):
        """ Get the pairs of aligned elements from files
        """
        refset = parsefile(reffile, indexes=ref_indexes,
                           encoding=ref_encoding, delimiter=ref_separator)
        targetset = parsefile(targetfile, indexes=target_indexes,
                              encoding=target_encoding, delimiter=target_separator)
        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
            yield pair

    def log_infos(self):
        """ Display some info on the aligner process
        """
        self.logger.info('Computation time : %s' % self.time)
        self.logger.info('Size reference set : %s' % self.refset_size)
        self.logger.info('Size target set : %s' % self.targetset_size)
        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
        self.logger.info('Alignments done : %s' % self.alignments_done)
        self.logger.info('Pairs found : %s' % self.pairs_found)
        self.logger.info('Ratio reference set/alignments done : %s'
                         % (self.alignments_done/float(self.refset_size)))
        self.logger.info('Ratio target set/alignments done : %s'
                         % (self.alignments_done/float(self.targetset_size)))
        self.logger.info('Ratio reference set/pairs found : %s'
                         % (self.pairs_found/float(self.refset_size)))
        self.logger.info('Ratio target set/pairs found : %s'
                         % (self.pairs_found/float(self.targetset_size)))
        self.logger.info('Maximum comparisons : %s'
                         % (self.refset_size * self.targetset_size))
        self.logger.info('Number of blocks : %s' % self.nb_blocks)
        if self.nb_blocks:
            self.logger.info('Ratio comparisons/block : %s'
                             % (float(self.nb_comparisons)/self.nb_blocks))
        self.logger.info('Blocking reduction : %s'
                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
2918 -
2919 -
2920 -###############################################################################
2921 -### PIPELINE ALIGNER OBJECT ##################################################
2922 -###############################################################################
class PipelineAligner(object):
    """ This pipeline will perform iterative alignments, removing each time
    the aligned results from the previous aligner: reference records already
    matched by an earlier aligner are not passed to the following ones.
    """

    def __init__(self, aligners):
        """ aligners: ordered list of aligner objects """
        self.aligners = aligners
        self.pairs = {}
        # Statistics aggregated over all the sub-aligners
        self.nb_comparisons = 0
        self.nb_blocks = 0
        self.alignments_done = 0
        self.pairs_found = 0
        self.refset_size = None
        self.targetset_size = None
        self.time = None
        self.logger = logging.getLogger('nazca.aligner')

    def get_aligned_pairs(self, refset, targetset, unique=True):
        """ Get the pairs of aligned elements, yielded as
        ((ref_id, ref_index), (target_id, target_index)) tuples
        """
        start_time = time.time()
        ref_index = range(len(refset))
        target_index = range(len(targetset))
        self.refset_size = len(refset)
        self.targetset_size = len(targetset)
        seen_refset = set()
        # Iteration over aligners
        for ind_aligner, aligner in enumerate(self.aligners):
            # Perform alignment on the records not yet aligned
            _refset = [refset[i] for i in ref_index]
            _targetset = [targetset[i] for i in target_index]
            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
                self.pairs_found += 1
                # Re-express the pair in the global indexes
                pair = ((pair[0][0], ref_index[pair[0][1]]),
                        (pair[1][0], target_index[pair[1][1]]))
                yield pair
                seen_refset.add(pair[0][1])
            # Store stats (alignments_done was previously never aggregated,
            # unlike nb_blocks and nb_comparisons)
            self.nb_blocks += aligner.nb_blocks
            self.nb_comparisons += aligner.nb_comparisons
            self.alignments_done += aligner.alignments_done
            # Update indexes if necessary
            # For now, we remove all the reference set that are already matched
            if ind_aligner < len(self.aligners) - 1:
                # There are other aligners after this one
                ref_index = [i for i in ref_index if i not in seen_refset]
        self.time = time.time() - start_time
        self.log_infos()

    def log_infos(self):
        """ Display some info on the aligner process
        """
        self.logger.info('Computation time : %s' % self.time)
        self.logger.info('Size reference set : %s' % self.refset_size)
        self.logger.info('Size target set : %s' % self.targetset_size)
        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
        self.logger.info('Alignments done : %s' % self.alignments_done)
        self.logger.info('Pairs found : %s' % self.pairs_found)
        self.logger.info('Ratio reference set/alignments done : %s'
                         % (self.alignments_done/float(self.refset_size)))
        self.logger.info('Ratio target set/alignments done : %s'
                         % (self.alignments_done/float(self.targetset_size)))
        self.logger.info('Ratio reference set/pairs found : %s'
                         % (self.pairs_found/float(self.refset_size)))
        self.logger.info('Ratio target set/pairs found : %s'
                         % (self.pairs_found/float(self.targetset_size)))
        self.logger.info('Maximum comparisons : %s'
                         % (self.refset_size * self.targetset_size))
        self.logger.info('Number of blocks : %s' % self.nb_blocks)
        if self.nb_blocks:
            self.logger.info('Ratio comparisons/block : %s'
                             % (float(self.nb_comparisons)/self.nb_blocks))
        self.logger.info('Blocking reduction : %s'
                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
diff --git a/record_linkage/blocking.py b/record_linkage/blocking.py
@@ -1,666 +0,0 @@
2998 -# -*- coding:utf-8 -*-
2999 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
3000 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
3001 -#
3002 -# This program is free software: you can redistribute it and/or modify it under
3003 -# the terms of the GNU Lesser General Public License as published by the Free
3004 -# Software Foundation, either version 2.1 of the License, or (at your option)
3005 -# any later version.
3006 -#
3007 -# This program is distributed in the hope that it will be useful, but WITHOUT
3008 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
3009 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
3010 -# details.
3011 -#
3012 -# You should have received a copy of the GNU Lesser General Public License along
3013 -# with this program. If not, see <http://www.gnu.org/licenses/>.
3014 -
3015 -
3016 -""" Blocking techniques.
3017 -
3018 -This module implements a set of blocking techniques used to split
3019 -datasets in smaller subsets that will be aligned in more details.
3020 -
3021 -Additional information:
3022 -
3023 -   P. Christen, Data Matching, Data-Centric Systems and Applications,
3024 -
3025 -
3026 -"""
3027 -from functools import partial
3028 -import warnings
3029 -
3030 -from scipy.spatial import KDTree
3031 -
3032 -from nazca.utils.minhashing import Minlsh
3033 -from nazca.utils.distances import soundexcode
3034 -
3035 -
3036 -###############################################################################
3037 -### GENERAL BLOCKING ##########################################################
3038 -###############################################################################
3039 -class BaseBlocking(object):
3040 -    """ An abstract general blocking object that exposes
3041 -    the API that should be common to all blockings object
3042 -    """
3043 -    def __init__(self, ref_attr_index, target_attr_index):
3044 -        """ Build the blocking object
3045 -
3046 -        Parameters
3047 -        ----------
3048 -
3049 -        ref_attr_index: index of the attribute of interest in a record
3050 -                        for the reference dataset
3051 -                        (i.e. attribute to be used for key computation)
3052 -
3053 -        target_attr_index: index of the attribute of interest in a record
3054 -                           for the target dataset
3055 -                           (i.e. attribute to be used for key computation)
3056 -        """
3057 -        self.ref_attr_index = ref_attr_index
3058 -        self.target_attr_index = target_attr_index
3059 -        self.refids = None
3060 -        self.targetids = None
3061 -        self.is_fitted = False
3062 -
3063 -    def _fit(self, refset, targetset):
3064 -        raise NotImplementedError
3065 -
3066 -    def _iter_blocks(self):
3067 -        """ Internal iteration function over blocks
3068 -        """
3069 -        raise NotImplementedError
3070 -
3071 -    def _cleanup(self):
3072 -        """ Internal cleanup blocking for further use (e.g. in pipeline)
3073 -        """
3074 -        raise NotImplementedError
3075 -
3076 -    def fit(self, refset, targetset):
3077 -        """ Fit the blocking technique on the reference and target datasets
3078 -
3079 -        Parameters
3080 -        ----------
3081 -        refset: a dataset (list of records)
3082 -
3083 -        targetset: a dataset (list of records)
3084 -        """
3085 -        self._fit(refset, targetset)
3086 -        # Keep ids for blocks building
3087 -        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
3088 -        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
3089 -        self.is_fitted = True
3090 -
3091 -    def iter_blocks(self):
3092 -        """ Iterator over the different possible blocks.
3093 -
3094 -        Returns
3095 -        -------
3096 -
3097 -        (block1, block2): The blocks are always (reference_block, target_block)
3098 -                          and contains the pair (index, id) of the record in the
3099 -                          corresponding dataset.
3100 -        """
3101 -        assert self.is_fitted
3102 -        return self._iter_blocks()
3103 -
3104 -    def iter_indice_blocks(self):
3105 -        """ Iterator over the different possible blocks.
3106 -
3107 -        Returns
3108 -        -------
3109 -
3110 -        (block1, block2): The blocks are always (reference_block, target_block)
3111 -                          and contains the indexes of the record in the
3112 -                          corresponding dataset.
3113 -        """
3114 -        assert self.is_fitted
3115 -        for block1, block2 in self._iter_blocks():
3116 -            yield [r[0] for r in block1], [r[0] for r in block2]
3117 -
3118 -    def iter_id_blocks(self):
3119 -        """ Iterator over the different possible blocks.
3120 -
3121 -        Returns
3122 -        -------
3123 -
3124 -        (block1, block2): The blocks are always (reference_block, target_block)
3125 -                          and contains the ids of the record in the
3126 -                          corresponding dataset.
3127 -        """
3128 -        assert self.is_fitted
3129 -        for block1, block2 in self._iter_blocks():
3130 -            yield [r[1] for r in block1], [r[1] for r in block2]
3131 -
3132 -    def iter_pairs(self):
3133 -        """ Iterator over the different possible pairs.
3134 -
3135 -        Returns
3136 -        -------
3137 -
3138 -        (pair1, pari2): The pairs are always ((ind_reference, id_reference),
3139 -                                              (ind_target, id_target))
3140 -                        and are the ids of the record in the corresponding dataset.
3141 -        """
3142 -        assert self.is_fitted
3143 -        for block1, block2 in self.iter_blocks():
3144 -            for val1 in block1:
3145 -                for val2 in block2:
3146 -                    yield val1, val2
3147 -
def iter_indice_pairs(self):
    """ Iterator over the different possible pairs.

    Returns
    -------

    (pair1, pair2): The pairs are always (ind_reference, ind_target)
                    and are the indexes of the record in the corresponding
                    dataset.
    """
    assert self.is_fitted
    # Cartesian product of each (reference, target) index block pair.
    for ref_block, target_block in self.iter_indice_blocks():
        pairs = ((ref_ind, target_ind)
                 for ref_ind in ref_block
                 for target_ind in target_block)
        for pair in pairs:
            yield pair
3162 -
def iter_id_pairs(self):
    """ Iterator over the different possible pairs.

    Returns
    -------

    (pair1, pair2): The pairs are always (id_reference, id_target)
                    and are the ids of the record in the corresponding dataset.
    """
    assert self.is_fitted
    # Cartesian product of each (reference, target) id block pair.
    for ref_block, target_block in self.iter_id_blocks():
        pairs = ((ref_id, target_id)
                 for ref_id in ref_block
                 for target_id in target_block)
        for pair in pairs:
            yield pair
3177 -
def cleanup(self):
    """ Cleanup blocking for further use (e.g. in pipeline)
    """
    # NOTE(review): this sets is_fitted to *True* while the subclass indexes
    # are being cleared, which looks inverted (one would expect False until
    # fit() is called again). PipelineBlocking calls cleanup() immediately
    # before re-fitting, so this may be deliberate -- confirm before changing.
    self.is_fitted = True
    # Delegate the actual index clearing to the subclass hook.
    self._cleanup()
3183 -
3184 -
3185 -###############################################################################
3186 -### KEY BLOCKING ##############################################################
3187 -###############################################################################
class KeyBlocking(BaseBlocking):
    """ This blocking technique is based on a blocking criteria
    (or blocking key), that will be used to divide the datasets.

    The main idea here is:

    1 - to create an index of f(x) for each x in the reference set.

    2 - to create an index of f(y) for each y in the target set.

    3 - to iterate on each distinct value of f(x) and to return
        the identifiers of the records of the both sets for this value.
    """

    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
        """ Build the key blocking.

        Parameters
        ----------

        ref_attr_index: index of the blocking attribute in a reference record.
        target_attr_index: index of the blocking attribute in a target record.
        callback: callable applied to the attribute value to compute the key.
        ignore_none: if True, records whose computed key is falsy are skipped.
        """
        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.callback = callback
        self.ignore_none = ignore_none
        self.reference_index = {}
        self.target_index = {}

    def _fit(self, refset, targetset):
        """ Fit a dataset in an index using the callback
        """
        for ind, rec in enumerate(refset):
            key = self.callback(rec[self.ref_attr_index])
            if not key and self.ignore_none:
                continue
            # Leaf entries are (index in dataset, record id) pairs.
            self.reference_index.setdefault(key, []).append((ind, rec[0]))
        for ind, rec in enumerate(targetset):
            key = self.callback(rec[self.target_attr_index])
            if not key and self.ignore_none:
                continue
            self.target_index.setdefault(key, []).append((ind, rec[0]))

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the indexes of the record in the
                          corresponding dataset.
        """
        # items() instead of the Python-2-only iteritems(): behaves the same
        # here and keeps the code importable under Python 3.
        for key, block1 in self.reference_index.items():
            block2 = self.target_index.get(key)
            if block1 and block2:
                yield (block1, block2)

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.reference_index = {}
        self.target_index = {}
3243 -
3244 -
class SoundexBlocking(KeyBlocking):
    """ Key blocking using the soundex code of the attribute value as key,
    computed for the given language.
    """

    def __init__(self, ref_attr_index, target_attr_index, language='french'):
        # The key callback is the soundex code, with the language frozen in.
        callback = partial(soundexcode, language=language)
        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
                                              callback)
3250 -
3251 -
3252 -###############################################################################
3253 -### BIGRAM BLOCKING ###########################################################
3254 -###############################################################################
class NGramBlocking(BaseBlocking):
    """ This blocking technique is based on an n-gram key: records are
    indexed in a tree of nested dicts keyed by the successive n-grams of
    the blocking attribute, down to `depth` levels.
    """

    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
        """ Build the n-gram blocking.

        Parameters
        ----------

        ref_attr_index: index of the blocking attribute in a reference record.
        target_attr_index: index of the blocking attribute in a target record.
        ngram_size: number of characters per n-gram.
        depth: number of successive n-grams used to index a record.
        """
        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.ngram_size = ngram_size
        self.depth = depth
        self.reference_index = {}
        self.target_index = {}

    def _fit_dataset(self, dataset, cur_index, attr_index):
        """ Index one dataset: walk/build the nested dicts along the first
        `depth` n-grams of the attribute and store (index, id) at the leaf.
        """
        for ind, r in enumerate(dataset):
            cur_dict = cur_index
            text = r[attr_index]
            for i in range(self.depth):
                ngram = text[i * self.ngram_size:(i + 1) * self.ngram_size]
                if i < self.depth - 1:
                    # Intermediate level: descend into (or create) a sub-dict.
                    cur_dict = cur_dict.setdefault(ngram, {})
            # Leaf level: append the (index, id) pair under the last n-gram.
            cur_dict.setdefault(ngram, []).append((ind, r[0]))

    def _fit(self, refset, targetset):
        """ Fit the two sets (reference set and target set)
        """
        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
        self._fit_dataset(targetset, self.target_index, self.target_attr_index)

    def _iter_dict(self, ref_cur_dict, target_cur_dict):
        """ Recursively walk both nested indexes in parallel, yielding the
        leaf lists found under the same n-gram path in both.
        """
        # items() instead of the Python-2-only iteritems(): behaves the same
        # here and keeps the code importable under Python 3.
        for key, sub_dict in ref_cur_dict.items():
            if key in target_cur_dict:
                if isinstance(sub_dict, dict):
                    # There is another dict layer
                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
                        yield block1, block2
                else:
                    # This is a list (leaf level)
                    yield sub_dict, target_cur_dict[key]

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the indexes of the record in the
                          corresponding dataset.
        """
        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
            if block1 and block2:
                yield block1, block2

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.reference_index = {}
        self.target_index = {}
3316 -
3317 -
3318 -###############################################################################
3319 -### SORTKEY BLOCKING ##########################################################
3320 -###############################################################################
class SortedNeighborhoodBlocking(BaseBlocking):
    """ Blocking technique based on a sorting blocking key: both datasets
    are merged and sorted on the key, and each reference record is paired
    with the target records found in a sliding window around it.
    """

    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.key_func = key_func
        self.window_width = window_width
        self.sorted_dataset = None

    def _fit(self, refset, targetset):
        """ Merge both datasets and sort them on the blocking key. """
        # Each entry is ((index, id), key attribute, origin flag);
        # flag 0 marks the reference set, flag 1 the target set.
        merged = [((ind, rec[0]), rec[self.ref_attr_index], 0)
                  for ind, rec in enumerate(refset)]
        merged += [((ind, rec[0]), rec[self.target_attr_index], 1)
                   for ind, rec in enumerate(targetset)]
        merged.sort(key=lambda entry: self.key_func(entry[1]))
        self.sorted_dataset = merged

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.
        """
        for pos, (rid, record, origin) in enumerate(self.sorted_dataset):
            # Windows are seeded by reference records only.
            if origin == 1:
                continue
            lower = max(pos - self.window_width, 0)
            upper = pos + self.window_width + 1
            window = [entry_id for entry_id, _, flag
                      in self.sorted_dataset[lower:upper] if flag == 1]
            if window:
                yield ([rid], window)

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.sorted_dataset = None
3361 -
3362 -
3363 -###############################################################################
3364 -### MERGE BLOCKING ############################################################
3365 -###############################################################################
class MergeBlocking(BaseBlocking):
    """ This blocking technique keeps only one appearance of a given value,
    and removes all the other records having this value.
    The merge is based on a score function

    E.g.
      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)

    could be (with a score function based on the population (third value):

      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)

    !!! WARNING !!! This is only done on ONE set (the one with a non null attr index)
    """

    def __init__(self, ref_attr_index, target_attr_index, score_func):
        """ Build the merge blocking.

        Parameters
        ----------

        ref_attr_index: index of the merge attribute in a reference record,
                        or None to merge the target set instead.
        target_attr_index: index of the merge attribute in a target record,
                           or None to merge the reference set instead.
        score_func: callable giving the score of a record; for each distinct
                    attribute value, only the highest-scored record is kept.

        Raises
        ------

        ValueError: if both attribute indexes are None.
        """
        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.score_func = score_func
        self.merged_dataset = None
        self.other_dataset = None
        if ref_attr_index is None and target_attr_index is None:
            raise ValueError('At least one of ref_attr_index or target_attr_index '
                             'should not be None')

    def _fit(self, refset, targetset):
        """ Fit a dataset in an index using the callback
        """
        if self.ref_attr_index is not None:
            # Merge refset
            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
        else:
            # Merge targetset
            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]

    def _merge_dataset(self, dataset, attr_index):
        """ Merge a dataset: keep, for each distinct attribute value, only
        the record with the strictly highest score (first record wins ties).
        """
        merged_dataset_dict = {}
        for ind, record in enumerate(dataset):
            score = self.score_func(record)
            # Single lookup instead of the former `in` test repeated in the
            # elif branch; `current` is the (ind, record, score) kept so far.
            current = merged_dataset_dict.get(record[attr_index])
            if current is None or current[2] < score:
                merged_dataset_dict[record[attr_index]] = (ind, record, score)
        # values() instead of the Python-2-only itervalues(): behaves the
        # same here and keeps the code importable under Python 3.
        return [(ind, r[0]) for ind, r, score in merged_dataset_dict.values()]

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.
        """
        if self.ref_attr_index is not None:
            yield self.merged_dataset, self.other_dataset
        else:
            # self.target_attr_index is not None
            yield self.other_dataset, self.merged_dataset

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.merged_dataset = None
        self.other_dataset = None
3432 -
3433 -
3434 -###############################################################################
3435 -### CLUSTERING-BASED BLOCKINGS ################################################
3436 -###############################################################################
class KmeansBlocking(BaseBlocking):
    """ A blocking technique based on Kmeans: the reference set is clustered,
    target records are assigned to the nearest cluster, and each cluster
    yields one (reference, target) block.
    """

    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
        """ Build the kmeans blocking.

        Parameters
        ----------

        n_clusters: number of clusters; if None, a heuristic based on the
                    reference set size is used at fit time.
        """
        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.n_clusters = n_clusters
        self.kmeans = None
        self.predicted = None
        from sklearn import cluster
        self.cluster_class = cluster.KMeans

    def _fit(self, refset, targetset):
        """ Fit the reference dataset.
        """
        # If an element is None (missing), use instead the identity element.
        # The identity element is defined as the 0-vector
        idelement = tuple([0 for _ in range(len(refset[0][self.ref_attr_index]))])
        # We assume here that there are at least 2 elements in the refset.
        # Floor division (//) keeps n_clusters an int under Python 3, where
        # `/` would produce a float; under Python 2 it is identical to `/`
        # on ints.
        n_clusters = self.n_clusters or (len(refset) // 10 or len(refset) // 2)
        kmeans = self.cluster_class(n_clusters=n_clusters)
        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
        self.kmeans = kmeans
        # Predict on targetset
        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
                                              or idelement for elt in targetset])

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the indexes of the record in the
                          corresponding dataset.
        """
        # range() instead of the Python-2-only xrange(); the list is small
        # (one slot per cluster).
        neighbours = [[[], []] for _ in range(self.kmeans.n_clusters)]
        for ind, li in enumerate(self.predicted):
            neighbours[li][1].append(self.targetids[ind])
        for ind, li in enumerate(self.kmeans.labels_):
            neighbours[li][0].append(self.refids[ind])
        for block1, block2 in neighbours:
            if len(block1) and len(block2):
                yield block1, block2

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.kmeans = None
        self.predicted = None
3488 -
3489 -
3490 -###############################################################################
3491 -### KDTREE BLOCKINGS ##########################################################
3492 -###############################################################################
class KdTreeBlocking(BaseBlocking):
    """ A blocking technique based on KdTree: both datasets are indexed in a
    KDTree, and each reference record is blocked with the target records
    found within `threshold` of it.
    """
    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.threshold = threshold
        self.reftree = None
        self.targettree = None
        self.nb_elements = None

    def _fit(self, refset, targetset):
        """ Fit the blocking
        """
        firstelement = refset[0][self.ref_attr_index]
        self.nb_elements = len(refset)
        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
        idelement = (0,) * idsize
        # KDTree is expecting a two-dimensional array
        if idsize == 1:
            # BUGFIX: the former `(elt[...],) or idelement` never substituted
            # the identity element, because a one-element tuple is always
            # truthy even when it wraps None. Substitute explicitly on None,
            # mirroring the intent of the idsize > 1 branch below.
            self.reftree = KDTree([(elt[self.ref_attr_index],)
                                   if elt[self.ref_attr_index] is not None
                                   else idelement for elt in refset])
            self.targettree = KDTree([(elt[self.target_attr_index],)
                                      if elt[self.target_attr_index] is not None
                                      else idelement for elt in targetset])
        else:
            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the indexes of the record in the
                          corresponding dataset.
        """
        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
        neighbours = []
        # range() instead of the Python-2-only xrange().
        for ind in range(self.nb_elements):
            if not extraneighbours[ind]:
                continue
            _ref = [self.refids[ind]]
            _target = [self.targetids[v] for v in extraneighbours[ind]]
            neighbours.append((_ref, _target))
        for block1, block2 in neighbours:
            if len(block1) and len(block2):
                yield block1, block2

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.reftree = None
        self.targettree = None
        self.nb_elements = None
3546 -
3547 -
3548 -###############################################################################
3549 -### MINHASHING BLOCKINGS ######################################################
3550 -###############################################################################
class MinHashingBlocking(BaseBlocking):
    """ A blocking technique based on MinHashing: a Minlsh sketch is trained
    on both datasets at once, and the predicted similarity groups are split
    back into (reference, target) blocks.
    """
    def __init__(self, ref_attr_index, target_attr_index,
                 threshold=0.1, kwordsgram=1, siglen=200):
        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.threshold = threshold
        self.kwordsgram = kwordsgram
        self.siglen = siglen
        self.minhasher = Minlsh()
        self.nb_elements = None

    def _fit(self, refset, targetset):
        """ Find the blocking using minhashing
        """
        # If an element is None (missing), use instead the identity element.
        idelement = ''
        # Train on the concatenation of both sets; reference records come
        # first, so positions below nb_elements belong to the reference set.
        corpus = [rec[self.ref_attr_index] or idelement for rec in refset]
        corpus += [rec[self.target_attr_index] or idelement for rec in targetset]
        self.minhasher.train(corpus, self.kwordsgram, self.siglen)
        self.nb_elements = len(refset)

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the indexes of the record in the
                          corresponding dataset.
        """
        for group in self.minhasher.predict(self.threshold):
            ref_block, target_block = [], []
            for position in group:
                # Positions past the reference set size index the target set.
                if position >= self.nb_elements:
                    target_block.append(self.targetids[position - self.nb_elements])
                else:
                    ref_block.append(self.refids[position])
            if ref_block and target_block:
                yield ref_block, target_block

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.minhasher = Minlsh()
        self.nb_elements = None
3603 -
3604 -
3605 -###############################################################################
3606 -### BLOCKING PIPELINE #########################################################
3607 -###############################################################################
class PipelineBlocking(BaseBlocking):
    """ Pipeline multiple blocking techniques: each blocking is applied to
    the blocks produced by the previous one, and only the blocks of the last
    stage are kept.
    """

    def __init__(self, blockings, collect_stats=False):
        """ Build the blocking object

        Parameters
        ----------

        blockings: ordered list of blocking objects
        collect_stats: if True, record the (reference, target) block sizes
                       produced at each stage in the `stats` attribute.
        """
        self.blockings = blockings
        self.stored_blocks = []
        self.collect_stats = collect_stats
        self.stats = {}

    def _fit(self, refset, targetset):
        """ Internal fit of the pipeline """
        self._recursive_fit(refset, targetset,
                            range(len(refset)), range(len(targetset)), 0)

    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
        """ Recursive fit of the blockings.
        Blocks are stored in the stored_blocks attribute.
        """
        blocking = self.blockings[ind]
        blocking.cleanup()
        blocking.fit([refset[i] for i in ref_index],
                     [targetset[i] for i in target_index])
        if ind == len(self.blockings) - 1:
            # Last stage of the pipeline: translate local indexes back to
            # the original datasets and store the final blocks.
            for block1, block2 in blocking.iter_blocks():
                if self.collect_stats:
                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
                self.stored_blocks.append(
                    ([(ref_index[i], _id) for i, _id in block1],
                     [(target_index[i], _id) for i, _id in block2]))
        else:
            # Intermediate stage: recurse into the next blocking on each
            # produced block, remapping indexes to the original datasets.
            for block1, block2 in blocking.iter_indice_blocks():
                if self.collect_stats:
                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
                self._recursive_fit(refset, targetset,
                                    [ref_index[i] for i in block1],
                                    [target_index[i] for i in block2],
                                    ind + 1)

    def _iter_blocks(self):
        """ Internal iteration function over blocks """
        for block1, block2 in self.stored_blocks:
            if block1 and block2:
                yield block1, block2
diff --git a/reference_data/__init__.py b/reference_data/__init__.py
diff --git a/reference_data/countries.py b/reference_data/countries.py
@@ -1,994 +0,0 @@
3664 -
3665 -# Countries list (ISO-3166)
3666 -COUNTRIES = {'##': 'non renseign\xc3\xa9',
3667 -             '..': 'non renseign\xc3\xa9',
3668 -             'aa': 'aire g\xc3\xa9ographique ancienne',
3669 -             'ad': 'Andorre',
3670 -             'ae': '\xc3\x89mirats arabes unis',
3671 -             'af': 'Afghanistan',
3672 -             'ag': 'Antigua-et-Barbuda',
3673 -             'ai': 'Anguilla',
3674 -             'al': 'Albanie',
3675 -             'am': 'Arm\xc3\xa9nie',
3676 -             'an': 'Antilles n\xc3\xa9erlandaises',
3677 -             'ao': 'Angola',
3678 -             'aq': 'Antarctique',
3679 -             'ar': 'Argentine',
3680 -             'as': 'Samoa am\xc3\xa9ricaines',
3681 -             'at': 'Autriche',
3682 -             'au': 'Australie',
3683 -             'aw': 'Aruba',
3684 -             'ax': 'Aland (\xc3\xaeles)',
3685 -             'az': 'Azerba\xc3\xafdjan',
3686 -             'ba': 'Bosnie-Herz\xc3\xa9govine',
3687 -             'bb': 'Barbade',
3688 -             'bd': 'Bangladesh',
3689 -             'be': 'Belgique',
3690 -             'bf': 'Burkina',
3691 -             'bg': 'Bulgarie',
3692 -             'bh': 'Bahre\xc3\xafn',
3693 -             'bi': 'Burundi',
3694 -             'bj': 'B\xc3\xa9nin',
3695 -             'bl': 'Saint-Barth\xc3\xa9lemy',
3696 -             'bm': 'Bermudes',
3697 -             'bn': 'Brun\xc3\xa9i',
3698 -             'bo': 'Bolivie',
3699 -             'bq': 'Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache',
3700 -             'br': 'Br\xc3\xa9sil',
3701 -             'bs': 'Bahamas',
3702 -             'bt': 'Bhoutan',
3703 -             'bv': 'Bouvet (\xc3\xaele)',
3704 -             'bw': 'Botswana',
3705 -             'by': 'Bi\xc3\xa9lorussie,B\xc3\xa9larus',
3706 -             'bz': 'Belize',
3707 -             'ca': 'Canada',
3708 -             'cc': 'Cocos (\xc3\xaeles),Keeling (\xc3\xaeles)',
3709 -             'cd': 'Congo (R\xc3\xa9publique d\xc3\xa9mocratique),Za\xc3\xafre',
3710 -             'cf': 'Centrafrique,R\xc3\xa9publique centrafricaine',
3711 -             'cg': 'Congo,Congo (R\xc3\xa9publique)',
3712 -             'ch': 'Suisse,Conf\xc3\xa9d\xc3\xa9ration helv\xc3\xa9tique',
3713 -             'ci': "C\xc3\xb4te d'Ivoire\n",
3714 -             'ck': 'Cook (\xc3\xaeles)',
3715 -             'cl': 'Chili',
3716 -             'cm': 'Cameroun',
3717 -             'cn': 'Chine,Chine (R\xc3\xa9publique populaire)',
3718 -             'co': 'Colombie',
3719 -             'cr': 'Costa Rica',
3720 -             'cs': 'Serbie-et-Mont\xc3\xa9n\xc3\xa9gro',
3721 -             'cu': 'Cuba',
3722 -             'cv': 'Cap-Vert',
3723 -             'cw': 'Cura\xc3\xa7ao',
3724 -             'cx': 'Christmas (\xc3\xaele)',
3725 -             'cy': 'Chypre',
3726 -             'cz': 'R\xc3\xa9publique tch\xc3\xa8que,Tch\xc3\xa8que, R\xc3\xa9publique',
3727 -             'dd': 'Allemagne (R\xc3\xa9publique d\xc3\xa9mocratique)',
3728 -             'de': 'Allemagne,Allemagne (R\xc3\xa9publique f\xc3\xa9d\xc3\xa9rale)',
3729 -             'dj': 'Djibouti',
3730 -             'dk': 'Danemark',
3731 -             'dm': 'Dominique',
3732 -             'do': 'R\xc3\xa9publique dominicaine,Dominicaine, R\xc3\xa9publique',
3733 -             'dz': 'Alg\xc3\xa9rie',
3734 -             'ec': '\xc3\x89quateur',
3735 -             'ee': 'Estonie',
3736 -             'eg': '\xc3\x89gypte',
3737 -             'eh': 'Sahara occidental',
3738 -             'er': '\xc3\x89rythr\xc3\xa9e',
3739 -             'es': 'Espagne',
3740 -             'et': '\xc3\x89thiopie',
3741 -             'fi': 'Finlande',
3742 -             'fj': 'Fidji',
3743 -             'fk': 'Malouines (\xc3\xaeles),Falkland (\xc3\xaeles)',
3744 -             'fm': 'Micron\xc3\xa9sie,\xc3\x89tats f\xc3\xa9d\xc3\xa9r\xc3\xa9s de Micron\xc3\xa9sie',
3745 -             'fo': 'F\xc3\xa9ro\xc3\xa9 (\xc3\xaeles)',
3746 -             'fr': 'France',
3747 -             'ga': 'Gabon',
3748 -             'gb': 'Grande-Bretagne,Royaume-Uni',
3749 -             'gd': 'Grenade',
3750 -             'ge': 'G\xc3\xa9orgie',
3751 -             'gf': 'Guyane fran\xc3\xa7aise',
3752 -             'gg': 'Guernesey',
3753 -             'gh': 'Ghana',
3754 -             'gi': 'Gibraltar',
3755 -             'gl': 'Groenland',
3756 -             'gm': 'Gambie',
3757 -             'gn': 'Guin\xc3\xa9e',
3758 -             'gp': 'Guadeloupe',
3759 -             'gq': 'Guin\xc3\xa9e \xc3\xa9quatoriale',
3760 -             'gr': 'Gr\xc3\xa8ce',
3761 -             'gs': 'G\xc3\xa9orgie du Sud et les \xc3\xaeles Sandwich du Sud',
3762 -             'gt': 'Guatemala',
3763 -             'gu': 'Guam',
3764 -             'gw': 'Guin\xc3\xa9e-Bissau',
3765 -             'gy': 'Guyana',
3766 -             'hk': 'Hong Kong',
3767 -             'hm': 'Heard (\xc3\xaele) et \xc3\xaeles McDonald',
3768 -             'hn': 'Honduras',
3769 -             'hr': 'Croatie',
3770 -             'ht': 'Ha\xc3\xafti',
3771 -             'hu': 'Hongrie',
3772 -             'id': 'Indon\xc3\xa9sie',
3773 -             'ie': 'Irlande',
3774 -             'ii': 'intergouvernemental',
3775 -             'il': 'Isra\xc3\xabl',
3776 -             'im': '\xc3\x8ele de Man,Man, \xc3\x8ele de',
3777 -             'in': 'Inde',
3778 -             'io': "Territoire britannique de l'Oc\xc3\xa9an indien,Chagos (\xc3\xaeles)###Oc\xc3\xa9an indien, Territoire britannique de l'\n",
3779 -             'iq': 'Irak',
3780 -             'ir': 'Iran',
3781 -             'is': 'Islande',
3782 -             'it': 'Italie',
3783 -             'je': 'Jersey',
3784 -             'jm': 'Jama\xc3\xafque',
3785 -             'jo': 'Jordanie',
3786 -             'jp': 'Japon',
3787 -             'ke': 'Kenya',
3788 -             'kg': 'Kirghizistan',
3789 -             'kh': 'Cambodge',
3790 -             'ki': 'Kiribati',
3791 -             'km': 'Comores',
3792 -             'kn': 'Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis',
3793 -             'ko': 'Kosovo',
3794 -             'kp': 'Cor\xc3\xa9e (R\xc3\xa9publique populaire d\xc3\xa9mocratique),Cor\xc3\xa9e du Nord',
3795 -             'kr': 'Cor\xc3\xa9e (R\xc3\xa9publique),Cor\xc3\xa9e du Sud',
3796 -             'kw': 'Kowe\xc3\xaft',
3797 -             'ky': 'Cayman,Ca\xc3\xafmanes, \xc3\x8eles###Ca\xc3\xafman (\xc3\xaeles)',
3798 -             'kz': 'Kazakhstan',
3799 -             'la': 'Laos',
3800 -             'lb': 'Liban',
3801 -             'lc': 'Sainte-Lucie',
3802 -             'li': 'Liechtenstein',
3803 -             'lk': 'Sri Lanka',
3804 -             'lr': 'Liberia',
3805 -             'ls': 'Lesotho',
3806 -             'lt': 'Lituanie',
3807 -             'lu': 'Luxembourg',
3808 -             'lv': 'Lettonie',
3809 -             'ly': 'Libye',
3810 -             'ma': 'Maroc',
3811 -             'mc': 'Monaco',
3812 -             'md': 'Moldavie,Moldova, R\xc3\xa9publique de',
3813 -             'me': 'Mont\xc3\xa9n\xc3\xa9gro',
3814 -             'mf': 'Saint-Martin (partie fran\xc3\xa7aise)',
3815 -             'mg': 'Madagascar',
3816 -             'mh': 'Marshall (\xc3\xaeles)',
3817 -             'mk': 'Mac\xc3\xa9doine (R\xc3\xa9publique)',
3818 -             'ml': 'Mali',
3819 -             'mm': 'Myanmar,Birmanie',
3820 -             'mn': 'Mongolie',
3821 -             'mo': 'Macao',
3822 -             'mp': 'Mariannes du Nord (\xc3\xaeles)',
3823 -             'mq': 'Martinique',
3824 -             'mr': 'Mauritanie',
3825 -             'ms': 'Montserrat',
3826 -             'mt': 'Malte',
3827 -             'mu': 'Maurice',
3828 -             'mv': 'Maldives',
3829 -             'mw': 'Malawi',
3830 -             'mx': 'Mexique',
3831 -             'my': 'Malaisie',
3832 -             'mz': 'Mozambique',
3833 -             'na': 'Namibie',
3834 -             'nc': 'Nouvelle-Cal\xc3\xa9donie',
3835 -             'ne': 'Niger',
3836 -             'nf': 'Norfolk (\xc3\xaele)',
3837 -             'ng': 'Nigeria',
3838 -             'ni': 'Nicaragua',
3839 -             'nl': 'Pays-Bas',
3840 -             'no': 'Norv\xc3\xa8ge',
3841 -             'np': 'N\xc3\xa9pal',
3842 -             'nr': 'Nauru',
3843 -             'nu': 'Niue',
3844 -             'nz': 'Nouvelle-Z\xc3\xa9lande',
3845 -             'om': 'Oman',
3846 -             'oo': 'code non adapt\xc3\xa9',
3847 -             'pa': 'Panama',
3848 -             'pe': 'P\xc3\xa9rou',
3849 -             'pf': 'Polyn\xc3\xa9sie fran\xc3\xa7aise',
3850 -             'pg': 'Papouasie-Nouvelle-Guin\xc3\xa9e',
3851 -             'ph': 'Philippines',
3852 -             'pk': 'Pakistan',
3853 -             'pl': 'Pologne',
3854 -             'pm': 'Saint-Pierre-et-Miquelon',
3855 -             'pn': 'Pitcairn',
3856 -             'pr': 'Porto Rico',
3857 -             'ps': 'Autorit\xc3\xa9 palestinienne,Palestine',
3858 -             'pt': 'Portugal',
3859 -             'pw': 'Palau,Palaos',
3860 -             'py': 'Paraguay',
3861 -             'qa': 'Qatar',
3862 -             're': 'R\xc3\xa9union',
3863 -             'ro': 'Roumanie',
3864 -             'rs': 'Serbie',
3865 -             'ru': 'Russie (F\xc3\xa9d\xc3\xa9ration),Russie',
3866 -             'rw': 'Rwanda',
3867 -             'sa': 'Arabie saoudite',
3868 -             'sb': 'Salomon (\xc3\xaeles)',
3869 -             'sc': 'Seychelles',
3870 -             'sd': 'Soudan',
3871 -             'se': 'Su\xc3\xa8de',
3872 -             'sg': 'Singapour',
3873 -             'sh': 'Sainte-H\xc3\xa9l\xc3\xa8ne,Ascension (\xc3\xaele)###Tristan da Cunha (\xc3\xaele)',
3874 -             'si': 'Slov\xc3\xa9nie',
3875 -             'sj': 'Svalbard et \xc3\xaele Jan Mayen',
3876 -             'sk': 'Slovaquie',
3877 -             'sl': 'Sierra Leone',
3878 -             'sm': 'Saint-Marin',
3879 -             'sn': 'S\xc3\xa9n\xc3\xa9gal',
3880 -             'so': 'Somalie',
3881 -             'sr': 'Suriname',
3882 -             'ss': 'Soudan du Sud,Sud Soudan',
3883 -             'st': 'Sao Tom\xc3\xa9-et-Principe',
3884 -             'su': 'URSS',
3885 -             'sv': 'El Salvador,Salvador',
3886 -             'sx': 'Saint-Martin (partie n\xc3\xa9erlandaise),Sint Maarten',
3887 -             'sy': 'Syrie',
3888 -             'sz': 'Swaziland',
3889 -             'tc': 'Turks et Ca\xc3\xafques (\xc3\xaeles)',
3890 -             'td': 'Tchad',
3891 -             'tf': 'Terres australes fran\xc3\xa7aises',
3892 -             'tg': 'Togo',
3893 -             'th': 'Tha\xc3\xaflande',
3894 -             'tj': 'Tadjikistan',
3895 -             'tk': 'Tokelau',
3896 -             'tl': 'Timor oriental',
3897 -             'tm': 'Turkm\xc3\xa9nistan',
3898 -             'tn': 'Tunisie',
3899 -             'to': 'Tonga',
3900 -             'tr': 'Turquie',
3901 -             'tt': 'Trinit\xc3\xa9-et-Tobago',
3902 -             'tv': 'Tuvalu',
3903 -             'tw': 'Ta\xc3\xafwan,Chine (R\xc3\xa9publique)',
3904 -             'tz': 'Tanzanie',
3905 -             'ua': 'Ukraine',
3906 -             'ug': 'Ouganda',
3907 -             'um': '\xc3\x8eles mineures \xc3\xa9loign\xc3\xa9es des \xc3\x89tats-Unis',
3908 -             'us': '\xc3\x89tats-Unis',
3909 -             'uy': 'Uruguay',
3910 -             'uz': 'Ouzb\xc3\xa9kistan',
3911 -             'va': 'Vatican,Saint-Si\xc3\xa8ge',
3912 -             'vc': 'Saint-Vincent-et-les Grenadines',
3913 -             've': 'Venezuela',
3914 -             'vg': '\xc3\x8eles Vierges britanniques,Vierges (\xc3\xaeles) britanniques',
3915 -             'vi': '\xc3\x8eles Vierges am\xc3\xa9ricaines,Vierges (\xc3\xaeles) am\xc3\xa9ricaines',
3916 -             'vn': 'Viet Nam',
3917 -             'vu': 'Vanuatu',
3918 -             'wf': 'Wallis et Futuna (\xc3\xaeles)',
3919 -             'ws': 'Samoa,Samoa occidentales',
3920 -             'xc': 'Tch\xc3\xa9coslovaquie',
3921 -             'xd': 'Allemagne avant 1945',
3922 -             'xe': 'Europe,Union europ\xc3\xa9enne',
3923 -             'xk': 'Cor\xc3\xa9e avant 1948',
3924 -             'xn': 'Pays-Bas avant 1830,Belgique avant 1830',
3925 -             'xx': 'inconnu',
3926 -             'yd': 'Y\xc3\xa9men (R\xc3\xa9publique d\xc3\xa9mocratique populaire),Sud Y\xc3\xa9men',
3927 -             'ye': 'Y\xc3\xa9men',
3928 -             'yt': 'Mayotte',
3929 -             'yu': 'Yougoslavie',
3930 -             'yy': "ne s'applique pas\n",
3931 -             'za': 'Afrique du Sud',
3932 -             'zm': 'Zambie',
3933 -             'zw': 'Zimbabwe',
3934 -             'zz': 'multiple\n'}
3935 -
3936 -
3937 -# REGIONS TO COUNTRIES MAPPING
3938 -REGIONS_TO_COUNTRIES = {u'Abruzzes': u'Italie',
3939 -                        u'Acha\xefe': u'Gr\xe8ce',
3940 -                        u'Acre': u'Br\xe9sil',
3941 -                        u'Afghanistan': u'Afghanistan',
3942 -                        u'Afrique du Sud': u'Afrique du Sud',
3943 -                        u'Aguascalientes': u'Mexique',
3944 -                        u'Ain': u'France',
3945 -                        u'Aisne': u'France',
3946 -                        u'Alabama': u'\xc9tats-Unis',
3947 -                        u'Alagoas': u'Br\xe9sil',
3948 -                        u'Aland (\xeeles)': u'Aland (\xeeles)',
3949 -                        u'Alaska': u'\xc9tats-Unis',
3950 -                        u'Albanie': u'Albanie',
3951 -                        u'Alberta': u'Canada',
3952 -                        u'Alg\xe9rie': u'Alg\xe9rie',
3953 -                        u'Allemagne': u'Allemagne',
3954 -                        u'Allemagne (R\xe9publique d\xe9mocratique)': u'Allemagne (R\xe9publique d\xe9mocratique)',
3955 -                        u'Allemagne avant 1945': u'Allemagne avant 1945',
3956 -                        u'Allier': u'France',
3957 -                        u'Alpes-Maritimes': u'France',
3958 -                        u'Alpes-de-Haute-Provence': u'France',
3959 -                        u'Alsace': u'France',
3960 -                        u'Amapa': u'Br\xe9sil',
3961 -                        u'Amazonas': u'Br\xe9sil',
3962 -                        u'Andalousie': u'Espagne',
3963 -                        u'Andorre': u'Andorre',
3964 -                        u'Angola': u'Angola',
3965 -                        u'Anguilla': u'Anguilla',
3966 -                        u'Antarctique': u'Antarctique',
3967 -                        u'Antigua-et-Barbuda': u'Antigua-et-Barbuda',
3968 -                        u'Antilles n\xe9erlandaises': u'Antilles n\xe9erlandaises',
3969 -                        u'Anvers': u'Belgique',
3970 -                        u'Appenzell-Rhodes-Ext\xe9rieures': u'Suisse',
3971 -                        u'Appenzell-Rhodes-Int\xe9rieures': u'Suisse',
3972 -                        u'Aquitaine': u'France',
3973 -                        u'Arabie saoudite': u'Arabie saoudite',
3974 -                        u'Aragon': u'Espagne',
3975 -                        u'Arcadie': u'Gr\xe8ce',
3976 -                        u'Ardennes': u'France',
3977 -                        u'Ard\xe8che': u'France',
3978 -                        u'Argentine': u'Argentine',
3979 -                        u'Argolide': u'Gr\xe8ce',
3980 -                        u'Argovie': u'Suisse',
3981 -                        u'Arizona': u'\xc9tats-Unis',
3982 -                        u'Ari\xe8ge': u'France',
3983 -                        u'Arkansas': u'\xc9tats-Unis',
3984 -                        u'Arm\xe9nie': u'Arm\xe9nie',
3985 -                        u'Aruba': u'Aruba',
3986 -                        u'Asturies': u'Espagne',
3987 -                        u'Ath\xe8nes et agglom\xe9ration': u'Gr\xe8ce',
3988 -                        u'Attique': u'Gr\xe8ce',
3989 -                        u'Aube': u'France',
3990 -                        u'Aude': u'France',
3991 -                        u'Australie': u'Australie',
3992 -                        u'Australie-M\xe9ridionale': u'Australie',
3993 -                        u'Australie-Occidentale': u'Australie',
3994 -                        u'Autorit\xe9 palestinienne': u'Autorit\xe9 palestinienne',
3995 -                        u'Autriche': u'Autriche',
3996 -                        u'Auvergne': u'France',
3997 -                        u'Aveyron': u'France',
3998 -                        u'Azerba\xefdjan': u'Azerba\xefdjan',
3999 -                        u'Bade-Wurtemberg': u'Allemagne',
4000 -                        u'Bahamas': u'Bahamas',
4001 -                        u'Bahia': u'Br\xe9sil',
4002 -                        u'Bahre\xefn': u'Bahre\xefn',
4003 -                        u'Baja California Norte': u'Mexique',
4004 -                        u'Baja California Sur': u'Mexique',
4005 -                        u'Bangladesh': u'Bangladesh',
4006 -                        u'Barbade': u'Barbade',
4007 -                        u'Bas-Rhin': u'France',
4008 -                        u'Basilicate': u'Italie',
4009 -                        u'Basse-Autriche': u'Autriche',
4010 -                        u'Basse-Normandie': u'France',
4011 -                        u'Basse-Saxe': u'Allemagne',
4012 -                        u'Bavi\xe8re': u'Allemagne',
4013 -                        u'Belgique': u'Belgique',
4014 -                        u'Belize': u'Belize',
4015 -                        u'Berlin': u'Allemagne',
4016 -                        u'Bermudes': u'Bermudes',
4017 -                        u'Berne': u'Suisse',
4018 -                        u'Bhoutan': u'Bhoutan',
4019 -                        u'Bi\xe9lorussie': u'Bi\xe9lorussie',
4020 -                        u'Bolivie': u'Bolivie',
4021 -                        u'Bonaire, Saint-Eustache et Saba': u'Bonaire, Saint-Eustache et Saba',
4022 -                        u'Bosnie-Herz\xe9govine': u'Bosnie-Herz\xe9govine',
4023 -                        u'Botswana': u'Botswana',
4024 -                        u'Bouches-du-Rh\xf4ne': u'France',
4025 -                        u'Bourgogne': u'France',
4026 -                        u'Bouvet (\xeele)': u'Bouvet (\xeele)',
4027 -                        u'Brabant': u'Belgique',
4028 -                        u'Brabant flamand': u'Belgique',
4029 -                        u'Brabant wallon': u'Belgique',
4030 -                        u'Brabant-Septentrional': u'Pays-Bas',
4031 -                        u'Brandebourg': u'Allemagne',
4032 -                        u'Bretagne': u'France',
4033 -                        u'Brun\xe9i': u'Brun\xe9i',
4034 -                        u'Bruxelles': u'Belgique',
4035 -                        u'Br\xe9sil': u'Br\xe9sil',
4036 -                        u'Br\xeame': u'Allemagne',
4037 -                        u'Buenos Aires': u'Argentine',
4038 -                        u'Bulgarie': u'Bulgarie',
4039 -                        u'Burgenland': u'Autriche',
4040 -                        u'Burkina': u'Burkina',
4041 -                        u'Burundi': u'Burundi',
4042 -                        u'B\xe2le-Campagne': u'Suisse',
4043 -                        u'B\xe2le-Ville': u'Suisse',
4044 -                        u'B\xe9nin': u'B\xe9nin',
4045 -                        u'B\xe9otie': u'Gr\xe8ce',
4046 -                        u'Calabre': u'Italie',
4047 -                        u'Californie': u'\xc9tats-Unis',
4048 -                        u'Calvados': u'France',
4049 -                        u'Cambodge': u'Cambodge',
4050 -                        u'Cameroun': u'Cameroun',
4051 -                        u'Campanie': u'Italie',
4052 -                        u'Campeche': u'Mexique',
4053 -                        u'Canada': u'Canada',
4054 -                        u'Canaries': u'Espagne',
4055 -                        u'Cantabrie': u'Espagne',
4056 -                        u'Cantal': u'France',
4057 -                        u'Cap-Vert': u'Cap-Vert',
4058 -                        u'Capitale f\xe9d\xe9rale': u'Argentine',
4059 -                        u'Carinthie': u'Autriche',
4060 -                        u'Caroline du Nord': u'\xc9tats-Unis',
4061 -                        u'Caroline du Sud': u'\xc9tats-Unis',
4062 -                        u'Castille et L\xe9on': u'Espagne',
4063 -                        u'Castille-la Manche': u'Espagne',
4064 -                        u'Catalogne': u'Espagne',
4065 -                        u'Catamarca': u'Argentine',
4066 -                        u'Cayman': u'Cayman',
4067 -                        u'Cear\xe1': u'Br\xe9sil',
4068 -                        u'Centrafrique': u'Centrafrique',
4069 -                        u'Centre': u'France',
4070 -                        u'Ceuta': u'Espagne',
4071 -                        u'Chaco': u'Argentine',
4072 -                        u'Chalcidique': u'Gr\xe8ce',
4073 -                        u'Champagne-Ardenne': u'France',
4074 -                        u'Charente': u'France',
4075 -                        u'Charente-Maritime': u'France',
4076 -                        u'Cher': u'France',
4077 -                        u'Chiapas': u'Mexique',
4078 -                        u'Chihuahua': u'Mexique',
4079 -                        u'Chili': u'Chili',
4080 -                        u'Chine': u'Chine',
4081 -                        u'Christmas (\xeele)': u'Christmas (\xeele)',
4082 -                        u'Chubut': u'Argentine',
4083 -                        u'Chypre': u'Chypre',
4084 -                        u'Ch\xedos': u'Gr\xe8ce',
4085 -                        u'Coahuila': u'Mexique',
4086 -                        u'Cocos (\xeeles)': u'Cocos (\xeeles)',
4087 -                        u'Colima': u'Mexique',
4088 -                        u'Colombie': u'Colombie',
4089 -                        u'Colombie britannique': u'Canada',
4090 -                        u'Colorado': u'\xc9tats-Unis',
4091 -                        u'Communaut\xe9 de Madrid': u'Espagne',
4092 -                        u'Communaut\xe9 de Valence': u'Espagne',
4093 -                        u'Comores': u'Comores',
4094 -                        u'Congo': u'Congo',
4095 -                        u'Congo (R\xe9publique d\xe9mocratique)': u'Congo (R\xe9publique d\xe9mocratique)',
4096 -                        u'Connecticut': u'\xc9tats-Unis',
4097 -                        u'Cook (\xeeles)': u'Cook (\xeeles)',
4098 -                        u'Corfou': u'Gr\xe8ce',
4099 -                        u'Corinthie': u'Gr\xe8ce',
4100 -                        u'Corrientes': u'Argentine',
4101 -                        u'Corr\xe8ze': u'France',
4102 -                        u'Corse': u'France',
4103 -                        u'Corse-du-Sud': u'France',
4104 -                        u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)': u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)',
4105 -                        u'Cor\xe9e (R\xe9publique)': u'Cor\xe9e (R\xe9publique)',
4106 -                        u'Cor\xe9e avant 1948': u'Cor\xe9e avant 1948',
4107 -                        u'Costa Rica': u'Costa Rica',
4108 -                        u'Creuse': u'France',
4109 -                        u'Croatie': u'Croatie',
4110 -                        u'Cr\xe8te': u'Gr\xe8ce',
4111 -                        u'Cuba': u'Cuba',
4112 -                        u'Cura\xe7ao': u'Cura\xe7ao',
4113 -                        u'Cyclades': u'Gr\xe8ce',
4114 -                        u'C\xe9phalonie': u'Gr\xe8ce',
4115 -                        u'C\xf3rdoba': u'Argentine',
4116 -                        u"C\xf4te d'Ivoire": u"C\xf4te d'Ivoire",
4117 -                        u"C\xf4te-d'Or": u'France',
4118 -                        u"C\xf4tes-d'Armor": u'France',
4119 -                        u'Dakota du Nord': u'\xc9tats-Unis',
4120 -                        u'Dakota du Sud': u'\xc9tats-Unis',
4121 -                        u'Danemark': u'Danemark',
4122 -                        u'Delaware': u'\xc9tats-Unis',
4123 -                        u'Deux-S\xe8vres': u'France',
4124 -                        u'District de Columbia': u'\xc9tats-Unis',
4125 -                        u'District f\xe9d\xe9ral': u'Br\xe9sil',
4126 -                        u'Djibouti': u'Djibouti',
4127 -                        u'Dod\xe9can\xe8se': u'Gr\xe8ce',
4128 -                        u'Dominique': u'Dominique',
4129 -                        u'Dordogne': u'France',
4130 -                        u'Doubs': u'France',
4131 -                        u'Drenthe': u'Pays-Bas',
4132 -                        u'Dr\xe1ma': u'Gr\xe8ce',
4133 -                        u'Dr\xf4me': u'France',
4134 -                        u'Durango': u'Mexique',
4135 -                        u'D\xe9pendance de Ross (Nouvelle-Z\xe9lande)': u'Antarctique',
4136 -                        u'El Salvador': u'El Salvador',
4137 -                        u'Entre-Rios': u'Argentine',
4138 -                        u'Espagne': u'Espagne',
4139 -                        u'Espirito Santo': u'Br\xe9sil',
4140 -                        u'Essonne': u'France',
4141 -                        u'Estonie': u'Estonie',
4142 -                        u'Estr\xe9madure': u'Espagne',
4143 -                        u'Eub\xe9e': u'Gr\xe8ce',
4144 -                        u'Eure': u'France',
4145 -                        u'Eure-et-Loir': u'France',
4146 -                        u'Eurytanie': u'Gr\xe8ce',
4147 -                        u'Fidji': u'Fidji',
4148 -                        u'Finist\xe8re': u'France',
4149 -                        u'Finlande': u'Finlande',
4150 -                        u'Flandre occidentale': u'Belgique',
4151 -                        u'Flandre orientale': u'Belgique',
4152 -                        u'Floride': u'\xc9tats-Unis',
4153 -                        u'Fl\xf3rina': u'Gr\xe8ce',
4154 -                        u'Formosa': u'Argentine',
4155 -                        u'France': u'France',
4156 -                        u'Franche-Comt\xe9': u'France',
4157 -                        u'Fribourg': u'Suisse',
4158 -                        u'Frioul-V\xe9n\xe9tie-Julienne': u'Italie',
4159 -                        u'Frise': u'Pays-Bas',
4160 -                        u'F\xe9ro\xe9 (\xeeles)': u'F\xe9ro\xe9 (\xeeles)',
4161 -                        u'Gabon': u'Gabon',
4162 -                        u'Galice': u'Espagne',
4163 -                        u'Gambie': u'Gambie',
4164 -                        u'Gard': u'France',
4165 -                        u'Gen\xe8ve': u'Suisse',
4166 -                        u'Gers': u'France',
4167 -                        u'Ghana': u'Ghana',
4168 -                        u'Gibraltar': u'Gibraltar',
4169 -                        u'Gironde': u'France',
4170 -                        u'Glaris': u'Suisse',
4171 -                        u'Goi\xe1s': u'Br\xe9sil',
4172 -                        u'Grande-Bretagne': u'Grande-Bretagne',
4173 -                        u'Grenade': u'Grenade',
4174 -                        u'Greven\xe1': u'Gr\xe8ce',
4175 -                        u'Grisons': u'Suisse',
4176 -                        u'Groenland': u'Groenland',
4177 -                        u'Groningue': u'Pays-Bas',
4178 -                        u'Gr\xe8ce': u'Gr\xe8ce',
4179 -                        u'Gr\xe8ce centrale': u'Gr\xe8ce',
4180 -                        u'Gr\xe8ce occidentale': u'Gr\xe8ce',
4181 -                        u'Guadeloupe': u'Guadeloupe',
4182 -                        u'Guam': u'Guam',
4183 -                        u'Guanajuato': u'Mexique',
4184 -                        u'Guatemala': u'Guatemala',
4185 -                        u'Gueldre': u'Pays-Bas',
4186 -                        u'Guernesey': u'Guernesey',
4187 -                        u'Guerrero': u'Mexique',
4188 -                        u'Guin\xe9e': u'Guin\xe9e',
4189 -                        u'Guin\xe9e \xe9quatoriale': u'Guin\xe9e \xe9quatoriale',
4190 -                        u'Guin\xe9e-Bissau': u'Guin\xe9e-Bissau',
4191 -                        u'Guyana': u'Guyana',
4192 -                        u'Guyane fran\xe7aise': u'Guyane fran\xe7aise',
4193 -                        u'G\xe9orgie': u'\xc9tats-Unis',
4194 -                        u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud': u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud',
4195 -                        u'Hainaut': u'Belgique',
4196 -                        u'Hambourg': u'Allemagne',
4197 -                        u'Haut-Rhin': u'France',
4198 -                        u'Haute-Autriche': u'Autriche',
4199 -                        u'Haute-Corse': u'France',
4200 -                        u'Haute-Garonne': u'France',
4201 -                        u'Haute-Loire': u'France',
4202 -                        u'Haute-Marne': u'France',
4203 -                        u'Haute-Normandie': u'France',
4204 -                        u'Haute-Savoie': u'France',
4205 -                        u'Haute-Sa\xf4ne': u'France',
4206 -                        u'Haute-Vienne': u'France',
4207 -                        u'Hautes-Alpes': u'France',
4208 -                        u'Hautes-Pyr\xe9n\xe9es': u'France',
4209 -                        u'Hauts-de-Seine': u'France',
4210 -                        u'Hawaii': u'\xc9tats-Unis',
4211 -                        u'Ha\xefti': u'Ha\xefti',
4212 -                        u'Heard (\xeele) et \xeeles McDonald': u'Heard (\xeele) et \xeeles McDonald',
4213 -                        u'Hesse': u'Allemagne',
4214 -                        u'Hidalgo': u'Mexique',
4215 -                        u'Hollande-M\xe9ridionale': u'Pays-Bas',
4216 -                        u'Hollande-Septentrionale': u'Pays-Bas',
4217 -                        u'Honduras': u'Honduras',
4218 -                        u'Hong Kong': u'Hong Kong',
4219 -                        u'Hongrie': u'Hongrie',
4220 -                        u'H\xe9rault': u'France',
4221 -                        u'Idaho': u'\xc9tats-Unis',
4222 -                        u'Ille-et-Vilaine': u'France',
4223 -                        u'Illinois': u'\xc9tats-Unis',
4224 -                        u'Inde': u'Inde',
4225 -                        u'Indiana': u'\xc9tats-Unis',
4226 -                        u'Indon\xe9sie': u'Indon\xe9sie',
4227 -                        u'Indre': u'France',
4228 -                        u'Indre-et-Loire': u'France',
4229 -                        u'Iowa': u'\xc9tats-Unis',
4230 -                        u'Io\xe1nnina': u'Gr\xe8ce',
4231 -                        u'Irak': u'Irak',
4232 -                        u'Iran': u'Iran',
4233 -                        u'Irlande': u'Irlande',
4234 -                        u'Ir\xe1kleion': u'Gr\xe8ce',
4235 -                        u'Islande': u'Islande',
4236 -                        u'Isra\xebl': u'Isra\xebl',
4237 -                        u'Is\xe8re': u'France',
4238 -                        u'Italie': u'Italie',
4239 -                        u'Jalisco': u'Mexique',
4240 -                        u'Jama\xefque': u'Jama\xefque',
4241 -                        u'Japon': u'Japon',
4242 -                        u'Jersey': u'Jersey',
4243 -                        u'Jordanie': u'Jordanie',
4244 -                        u'Jujuy': u'Argentine',
4245 -                        u'Jura': u'France',
4246 -                        u'Kansas': u'\xc9tats-Unis',
4247 -                        u'Kard\xedtsa': u'Gr\xe8ce',
4248 -                        u'Kastori\xe1': u'Gr\xe8ce',
4249 -                        u'Kav\xe1la': u'Gr\xe8ce',
4250 -                        u'Kazakhstan': u'Kazakhstan',
4251 -                        u'Kentucky': u'\xc9tats-Unis',
4252 -                        u'Kenya': u'Kenya',
4253 -                        u'Kilk\xeds': u'Gr\xe8ce',
4254 -                        u'Kirghizistan': u'Kirghizistan',
4255 -                        u'Kiribati': u'Kiribati',
4256 -                        u'Kosovo': u'Kosovo',
4257 -                        u'Kowe\xeft': u'Kowe\xeft',
4258 -                        u'Koz\xe1ni': u'Gr\xe8ce',
4259 -                        u'La Can\xe9e': u'Gr\xe8ce',
4260 -                        u'Laconie': u'Gr\xe8ce',
4261 -                        u'Landes': u'France',
4262 -                        u'Languedoc-Roussillon': u'France',
4263 -                        u'Laos': u'Laos',
4264 -                        u'Las\xedthi': u'Gr\xe8ce',
4265 -                        u'Latium': u'Italie',
4266 -                        u'Le Pir\xe9e': u'Gr\xe8ce',
4267 -                        u'Lesotho': u'Lesotho',
4268 -                        u'Lettonie': u'Lettonie',
4269 -                        u'Leucade': u'Gr\xe8ce',
4270 -                        u'Liban': u'Liban',
4271 -                        u'Liberia': u'Liberia',
4272 -                        u'Libye': u'Libye',
4273 -                        u'Liechtenstein': u'Liechtenstein',
4274 -                        u'Ligurie': u'Italie',
4275 -                        u'Limbourg': u'Pays-Bas',
4276 -                        u'Limousin': u'France',
4277 -                        u'Lituanie': u'Lituanie',
4278 -                        u'Li\xe8ge': u'Belgique',
4279 -                        u'Loir-et-Cher': u'France',
4280 -                        u'Loire': u'France',
4281 -                        u'Loire-Atlantique': u'France',
4282 -                        u'Loiret': u'France',
4283 -                        u'Lombardie': u'Italie',
4284 -                        u'Lorraine': u'France',
4285 -                        u'Lot': u'France',
4286 -                        u'Lot-et-Garonne': u'France',
4287 -                        u'Louisiane': u'\xc9tats-Unis',
4288 -                        u'Loz\xe8re': u'France',
4289 -                        u'Lucerne': u'Suisse',
4290 -                        u'Luxembourg': u'Belgique',
4291 -                        u'L\xe1risa': u'Gr\xe8ce',
4292 -                        u'L\xe9svos': u'Gr\xe8ce',
4293 -                        u'Macao': u'Macao',
4294 -                        u'Mac\xe9doine (R\xe9publique)': u'Mac\xe9doine (R\xe9publique)',
4295 -                        u'Mac\xe9doine centrale': u'Gr\xe8ce',
4296 -                        u'Mac\xe9doine occidentale': u'Gr\xe8ce',
4297 -                        u'Mac\xe9doine orientale et Thrace': u'Gr\xe8ce',
4298 -                        u'Madagascar': u'Madagascar',
4299 -                        u'Magn\xe9sie': u'Gr\xe8ce',
4300 -                        u'Maine': u'\xc9tats-Unis',
4301 -                        u'Maine-et-Loire': u'France',
4302 -                        u'Malaisie': u'Malaisie',
4303 -                        u'Malawi': u'Malawi',
4304 -                        u'Maldives': u'Maldives',
4305 -                        u'Mali': u'Mali',
4306 -                        u'Malouines (\xeeles)': u'Malouines (\xeeles)',
4307 -                        u'Malte': u'Malte',
4308 -                        u'Manche': u'France',
4309 -                        u'Manitoba': u'Canada',
4310 -                        u'Maranh\xe3o': u'Br\xe9sil',
4311 -                        u'Marches': u'Italie',
4312 -                        u'Mariannes du Nord (\xeeles)': u'Mariannes du Nord (\xeeles)',
4313 -                        u'Marne': u'France',
4314 -                        u'Maroc': u'Maroc',
4315 -                        u'Marshall (\xeeles)': u'Marshall (\xeeles)',
4316 -                        u'Martinique': u'Martinique',
4317 -                        u'Maryland': u'\xc9tats-Unis',
4318 -                        u'Massachusetts': u'\xc9tats-Unis',
4319 -                        u'Mato grosso': u'Br\xe9sil',
4320 -                        u'Mato grosso do Sul': u'Br\xe9sil',
4321 -                        u'Maurice': u'Maurice',
4322 -                        u'Mauritanie': u'Mauritanie',
4323 -                        u'Mayenne': u'France',
4324 -                        u'Mayotte': u'Mayotte',
4325 -                        u'Mecklembourg-Pom\xe9ranie ant\xe9rieure': u'Allemagne',
4326 -                        u'Melilla': u'Espagne',
4327 -                        u'Mendoza': u'Argentine',
4328 -                        u'Mess\xe9nie': u'Gr\xe8ce',
4329 -                        u'Meurthe-et-Moselle': u'France',
4330 -                        u'Meuse': u'France',
4331 -                        u'Mexico': u'Mexique',
4332 -                        u'Mexique': u'Mexique',
4333 -                        u'Michigan': u'\xc9tats-Unis',
4334 -                        u'Michoac\xe1n': u'Mexique',
4335 -                        u'Micron\xe9sie': u'Micron\xe9sie',
4336 -                        u'Midi-Pyr\xe9n\xe9es': u'France',
4337 -                        u'Minas Gerais': u'Br\xe9sil',
4338 -                        u'Minnesota': u'\xc9tats-Unis',
4339 -                        u'Misiones': u'Argentine',
4340 -                        u'Mississippi': u'\xc9tats-Unis',
4341 -                        u'Missouri': u'\xc9tats-Unis',
4342 -                        u'Moldavie': u'Moldavie',
4343 -                        u'Molise': u'Italie',
4344 -                        u'Monaco': u'Monaco',
4345 -                        u'Mongolie': u'Mongolie',
4346 -                        u'Montana': u'\xc9tats-Unis',
4347 -                        u'Montserrat': u'Montserrat',
4348 -                        u'Mont\xe9n\xe9gro': u'Mont\xe9n\xe9gro',
4349 -                        u'Morbihan': u'France',
4350 -                        u'Morelos': u'Mexique',
4351 -                        u'Moselle': u'France',
4352 -                        u'Mozambique': u'Mozambique',
4353 -                        u'Murcie': u'Espagne',
4354 -                        u'Myanmar': u'Myanmar',
4355 -                        u'Namibie': u'Namibie',
4356 -                        u'Namur': u'Belgique',
4357 -                        u'Nauru': u'Nauru',
4358 -                        u'Navarre': u'Espagne',
4359 -                        u'Nayarit': u'Mexique',
4360 -                        u'Nebraska': u'\xc9tats-Unis',
4361 -                        u'Neuch\xe2tel': u'Suisse',
4362 -                        u'Neuqu\xe9n': u'Argentine',
4363 -                        u'Nevada': u'\xc9tats-Unis',
4364 -                        u'New Hampshire': u'\xc9tats-Unis',
4365 -                        u'New Jersey': u'\xc9tats-Unis',
4366 -                        u'New York': u'\xc9tats-Unis',
4367 -                        u'Nicaragua': u'Nicaragua',
4368 -                        u'Nidwald': u'Suisse',
4369 -                        u'Niger': u'Niger',
4370 -                        u'Nigeria': u'Nigeria',
4371 -                        u'Niue': u'Niue',
4372 -                        u'Ni\xe8vre': u'France',
4373 -                        u'Nord': u'France',
4374 -                        u'Nord-Pas-de-Calais': u'France',
4375 -                        u'Norfolk (\xeele)': u'Norfolk (\xeele)',
4376 -                        u'Norv\xe8ge': u'Norv\xe8ge',
4377 -                        u'Nouveau Mexique': u'\xc9tats-Unis',
4378 -                        u'Nouveau-Brunswick': u'Canada',
4379 -                        u'Nouvelle-Cal\xe9donie': u'Nouvelle-Cal\xe9donie',
4380 -                        u'Nouvelle-Galles-du-Sud': u'Australie',
4381 -                        u'Nouvelle-Z\xe9lande': u'Nouvelle-Z\xe9lande',
4382 -                        u'Nouvelle-\xc9cosse': u'Canada',
4383 -                        u'Nuevo Le\xf3n': u'Mexique',
4384 -                        u'N\xe9pal': u'N\xe9pal',
4385 -                        u'Oaxaca': u'Mexique',
4386 -                        u'Obwald': u'Suisse',
4387 -                        u'Ohio': u'\xc9tats-Unis',
4388 -                        u'Oise': u'France',
4389 -                        u'Oklahoma': u'\xc9tats-Unis',
4390 -                        u'Oman': u'Oman',
4391 -                        u'Ombrie': u'Italie',
4392 -                        u'Ontario': u'Canada',
4393 -                        u'Oregon': u'\xc9tats-Unis',
4394 -                        u'Orne': u'France',
4395 -                        u'Ouganda': u'Ouganda',
4396 -                        u'Ouzb\xe9kistan': u'Ouzb\xe9kistan',
4397 -                        u'Overijssell': u'Pays-Bas',
4398 -                        u'Pakistan': u'Pakistan',
4399 -                        u'Palau': u'Palau',
4400 -                        u'Pampa': u'Argentine',
4401 -                        u'Panama': u'Panama',
4402 -                        u'Papouasie-Nouvelle-Guin\xe9e': u'Papouasie-Nouvelle-Guin\xe9e',
4403 -                        u'Paraguay': u'Paraguay',
4404 -                        u'Paraiba': u'Br\xe9sil',
4405 -                        u'Param\xe1': u'Br\xe9sil',
4406 -                        u'Paris': u'France',
4407 -                        u'Par\xe1': u'Br\xe9sil',
4408 -                        u'Pas-de-Calais': u'France',
4409 -                        u'Pays Basque': u'Espagne',
4410 -                        u'Pays-Bas': u'Pays-Bas',
4411 -                        u'Pays-Bas avant 1830': u'Pays-Bas avant 1830',
4412 -                        u'Pays-de-la-Loire': u'France',
4413 -                        u'Pennsylvanie': u'\xc9tats-Unis',
4414 -                        u'Pernambouc': u'Br\xe9sil',
4415 -                        u'Philippines': u'Philippines',
4416 -                        u'Phocide': u'Gr\xe8ce',
4417 -                        u'Phtiotide': u'Gr\xe8ce',
4418 -                        u'Piau\xed': u'Br\xe9sil',
4419 -                        u'Picardie': u'France',
4420 -                        u'Pitcairn': u'Pitcairn',
4421 -                        u'Pi\xe9mont': u'Italie',
4422 -                        u'Pi\xe9rie': u'Gr\xe8ce',
4423 -                        u'Poitou-Charentes': u'France',
4424 -                        u'Pologne': u'Pologne',
4425 -                        u'Polyn\xe9sie fran\xe7aise': u'Polyn\xe9sie fran\xe7aise',
4426 -                        u'Porto Rico': u'Porto Rico',
4427 -                        u'Portugal': u'Portugal',
4428 -                        u'Pouilles': u'Italie',
4429 -                        u"Provence-Alpes-C\xf4te d'Azur": u'France',
4430 -                        u'Pr\xe9veza': u'Gr\xe8ce',
4431 -                        u'Puebla': u'Mexique',
4432 -                        u'Puy-de-D\xf4me': u'France',
4433 -                        u'Pyr\xe9n\xe9es-Atlantiques': u'France',
4434 -                        u'Pyr\xe9n\xe9es-Orientales': u'France',
4435 -                        u'P\xe9lla': u'Gr\xe8ce',
4436 -                        u'P\xe9loponn\xe8se': u'Gr\xe8ce',
4437 -                        u'P\xe9rou': u'P\xe9rou',
4438 -                        u'Qatar': u'Qatar',
4439 -                        u'Queensland': u'Australie',
4440 -                        u'Quer\xe9taro': u'Mexique',
4441 -                        u'Quintana Roo': u'Mexique',
4442 -                        u'Qu\xe9bec': u'Canada',
4443 -                        u'Rhode Island': u'\xc9tats-Unis',
4444 -                        u'Rhodope': u'Gr\xe8ce',
4445 -                        u'Rh\xe9nanie-Palatinat': u'Allemagne',
4446 -                        u'Rh\xe9nanie-du-Nord-Westphalie': u'Allemagne',
4447 -                        u'Rh\xf4ne': u'France',
4448 -                        u'Rh\xf4ne-Alpes': u'France',
4449 -                        u'Rio Grande do Norte': u'Br\xe9sil',
4450 -                        u'Rio Grande do Sul': u'Br\xe9sil',
4451 -                        u'Rio Negro': u'Argentine',
4452 -                        u'Rio de Janeiro': u'Br\xe9sil',
4453 -                        u'Rioja': u'Argentine',
4454 -                        u'Rond\xf4nia': u'Br\xe9sil',
4455 -                        u'Roraima': u'Br\xe9sil',
4456 -                        u'Roumanie': u'Roumanie',
4457 -                        u'Royaume-Uni': u'Grande-Bretagne',
4458 -                        u'Russie (F\xe9d\xe9ration)': u'Russie (F\xe9d\xe9ration)',
4459 -                        u'Rwanda': u'Rwanda',
4460 -                        u'R\xc3\xa9publique Tch\xc3\xa8que': u'R\xc3\xa9publique tch\xc3\xa8que',
4461 -                        u'R\xe9publique dominicaine': u'R\xe9publique dominicaine',
4462 -                        u'R\xe9publique tch\xe8que': u'R\xe9publique tch\xe8que',
4463 -                        u'R\xe9thymnon': u'Gr\xe8ce',
4464 -                        u'R\xe9union': u'R\xe9union',
4465 -                        u'Sahara occidental': u'Sahara occidental',
4466 -                        u'Saint-Barth\xe9lemy': u'Saint-Barth\xe9lemy',
4467 -                        u'Saint-Gall': u'Suisse',
4468 -                        u'Saint-Kitts-et-Nevis': u'Saint-Kitts-et-Nevis',
4469 -                        u'Saint-Marin': u'Saint-Marin',
4470 -                        u'Saint-Martin (partie fran\xe7aise)': u'Saint-Martin (partie fran\xe7aise)',
4471 -                        u'Saint-Martin (partie n\xe9erlandaise)': u'Saint-Martin (partie n\xe9erlandaise)',
4472 -                        u'Saint-Pierre-et-Miquelon': u'Saint-Pierre-et-Miquelon',
4473 -                        u'Saint-Vincent-et-les Grenadines': u'Saint-Vincent-et-les Grenadines',
4474 -                        u'Sainte-H\xe9l\xe8ne': u'Sainte-H\xe9l\xe8ne',
4475 -                        u'Sainte-Lucie': u'Sainte-Lucie',
4476 -                        u'Salomon (\xeeles)': u'Salomon (\xeeles)',
4477 -                        u'Salta': u'Argentine',
4478 -                        u'Salzbourg': u'Autriche',
4479 -                        u'Samoa': u'Samoa',
4480 -                        u'Samoa am\xe9ricaines': u'Samoa am\xe9ricaines',
4481 -                        u'San Juan': u'Argentine',
4482 -                        u'San Luis': u'Argentine',
4483 -                        u'San Luis Potos\xed': u'Mexique',
4484 -                        u'Santa Catarina': u'Br\xe9sil',
4485 -                        u'Santa Cruz': u'Argentine',
4486 -                        u'Santa Fe': u'Argentine',
4487 -                        u'Santiago del Estero': u'Argentine',
4488 -                        u'Sao Tom\xe9-et-Principe': u'Sao Tom\xe9-et-Principe',
4489 -                        u'Sardaigne': u'Italie',
4490 -                        u'Sarre': u'Allemagne',
4491 -                        u'Sarthe': u'France',
4492 -                        u'Saskatchewan': u'Canada',
4493 -                        u'Savoie': u'France',
4494 -                        u'Saxe': u'Allemagne',
4495 -                        u'Saxe-Anhalt': u'Allemagne',
4496 -                        u'Sa\xf4ne-et-Loire': u'France',
4497 -                        u'Schaffhouse': u'Suisse',
4498 -                        u'Schleswig-Holstein': u'Allemagne',
4499 -                        u'Schwyz': u'Suisse',
4500 -                        u'Seine-Maritime': u'France',
4501 -                        u'Seine-Saint-Denis': u'France',
4502 -                        u'Seine-et-Marne': u'France',
4503 -                        u'Serbie': u'Serbie',
4504 -                        u'Serbie-et-Mont\xe9n\xe9gro': u'Serbie-et-Mont\xe9n\xe9gro',
4505 -                        u'Sergipe': u'Br\xe9sil',
4506 -                        u'Seychelles': u'Seychelles',
4507 -                        u'Sicile': u'Italie',
4508 -                        u'Sierra Leone': u'Sierra Leone',
4509 -                        u'Sinaloa': u'Mexique',
4510 -                        u'Singapour': u'Singapour',
4511 -                        u'Slovaquie': u'Slovaquie',
4512 -                        u'Slov\xe9nie': u'Slov\xe9nie',
4513 -                        u'Soleure': u'Suisse',
4514 -                        u'Somalie': u'Somalie',
4515 -                        u'Somme': u'France',
4516 -                        u'Sonora': u'Mexique',
4517 -                        u'Soudan': u'Soudan',
4518 -                        u'Soudan du Sud': u'Soudan du Sud',
4519 -                        u'Sri Lanka': u'Sri Lanka',
4520 -                        u'Styrie': u'Autriche',
4521 -                        u'Suisse': u'Suisse',
4522 -                        u'Suriname': u'Suriname',
4523 -                        u'Su\xe8de': u'Su\xe8de',
4524 -                        u'Svalbard et \xeele Jan Mayen': u'Svalbard et \xeele Jan Mayen',
4525 -                        u'Swaziland': u'Swaziland',
4526 -                        u'Syrie': u'Syrie',
4527 -                        u'S\xe1mos': u'Gr\xe8ce',
4528 -                        u'S\xe3o Paulo': u'Br\xe9sil',
4529 -                        u'S\xe9n\xe9gal': u'S\xe9n\xe9gal',
4530 -                        u'S\xe9rrai': u'Gr\xe8ce',
4531 -                        u'Tabasco': u'Mexique',
4532 -                        u'Tadjikistan': u'Tadjikistan',
4533 -                        u'Tamaulipas': u'Mexique',
4534 -                        u'Tanzanie': u'Tanzanie',
4535 -                        u'Tarn': u'France',
4536 -                        u'Tarn-et-Garonne': u'France',
4537 -                        u'Tasmanie': u'Australie',
4538 -                        u'Ta\xefwan': u'Ta\xefwan',
4539 -                        u'Tchad': u'Tchad',
4540 -                        u'Tch\xe9coslovaquie': u'Tch\xe9coslovaquie',
4541 -                        u'Tennessee': u'\xc9tats-Unis',
4542 -                        u'Terre de Feu': u'Argentine',
4543 -                        u'Terre de la Reine-Maud (Norv\xe8ge)': u'Antarctique',
4544 -                        u'Terre-Neuve': u'Canada',
4545 -                        u'Terres australes et antarctiques fran\xe7aises': u'Antarctique',
4546 -                        u'Terres australes fran\xe7aises': u'Terres australes fran\xe7aises',
4547 -                        u'Territoire antarctique australien': u'Antarctique',
4548 -                        u'Territoire antarctique britannique': u'Antarctique',
4549 -                        u"Territoire britannique de l'Oc\xe9an indien": u"Territoire britannique de l'Oc\xe9an indien",
4550 -                        u'Territoire de la capitale australienne': u'Australie',
4551 -                        u'Territoire du Nord': u'Australie',
4552 -                        u'Territoire du Yukon': u'Canada',
4553 -                        u'Territoire-de-Belfort': u'France',
4554 -                        u'Territoires du Nord-Ouest': u'Canada',
4555 -                        u'Tessin': u'Suisse',
4556 -                        u'Texas': u'\xc9tats-Unis',
4557 -                        u'Tha\xeflande': u'Tha\xeflande',
4558 -                        u'Thesprotie': u'Gr\xe8ce',
4559 -                        u'Thessalie': u'Gr\xe8ce',
4560 -                        u'Thessalonique': u'Gr\xe8ce',
4561 -                        u'Thurgovie': u'Suisse',
4562 -                        u'Thuringe': u'Allemagne',
4563 -                        u'Timor oriental': u'Timor oriental',
4564 -                        u'Tlaxcala': u'Mexique',
4565 -                        u'Togo': u'Togo',
4566 -                        u'Tokelau': u'Tokelau',
4567 -                        u'Tonga': u'Tonga',
4568 -                        u'Toscane': u'Italie',
4569 -                        u'Trentin-Haut-Adige': u'Italie',
4570 -                        u'Trinit\xe9-et-Tobago': u'Trinit\xe9-et-Tobago',
4571 -                        u'Tr\xedkala': u'Gr\xe8ce',
4572 -                        u'Tucum\xe1n': u'Argentine',
4573 -                        u'Tunisie': u'Tunisie',
4574 -                        u'Turkm\xe9nistan': u'Turkm\xe9nistan',
4575 -                        u'Turks et Ca\xefques (\xeeles)': u'Turks et Ca\xefques (\xeeles)',
4576 -                        u'Turquie': u'Turquie',
4577 -                        u'Tuvalu': u'Tuvalu',
4578 -                        u'Tyrol': u'Autriche',
4579 -                        u'URSS': u'URSS',
4580 -                        u'US': u'\xc9tats-Unis',
4581 -                        'USA': u'\xc9tats-Unis',
4582 -                        u'Ukraine': u'Ukraine',
4583 -                        u'Uri': u'Suisse',
4584 -                        u'Uruguay': u'Uruguay',
4585 -                        u'Utah': u'\xc9tats-Unis',
4586 -                        u'Utrecht': u'Pays-Bas',
4587 -                        u"Val d'Aoste": u'Italie',
4588 -                        u"Val-d'Oise": u'France',
4589 -                        u'Val-de-Marne': u'France',
4590 -                        u'Valais': u'Suisse',
4591 -                        u'Vanuatu': u'Vanuatu',
4592 -                        u'Var': u'France',
4593 -                        u'Vatican': u'Vatican',
4594 -                        u'Vaucluse': u'France',
4595 -                        u'Vaud': u'Suisse',
4596 -                        u'Vend\xe9e': u'France',
4597 -                        u'Venezuela': u'Venezuela',
4598 -                        u'Veracruz': u'Mexique',
4599 -                        u'Vermont': u'\xc9tats-Unis',
4600 -                        u'Victoria': u'Australie',
4601 -                        u'Vienne': u'Autriche',
4602 -                        u'Viet Nam': u'Viet Nam',
4603 -                        u'Virginie': u'\xc9tats-Unis',
4604 -                        u'Virginie occidentale': u'\xc9tats-Unis',
4605 -                        u'Vorarlberg': u'Autriche',
4606 -                        u'Vosges': u'France',
4607 -                        u'V\xe9n\xe9tie': u'Italie',
4608 -                        u'Wallis et Futuna (\xeeles)': u'Wallis et Futuna (\xeeles)',
4609 -                        u'Washington': u'\xc9tats-Unis',
4610 -                        u'Wisconsin': u'\xc9tats-Unis',
4611 -                        u'Wyoming': u'\xc9tats-Unis',
4612 -                        u'X\xe1nthi': u'Gr\xe8ce',
4613 -                        u'Yonne': u'France',
4614 -                        u'Yougoslavie': u'Yougoslavie',
4615 -                        u'Yucat\xe1n': u'Mexique',
4616 -                        u'Yvelines': u'France',
4617 -                        u'Y\xe9men': u'Y\xe9men',
4618 -                        u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)': u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)',
4619 -                        u'Zacatecas': u'Mexique',
4620 -                        u'Zambie': u'Zambie',
4621 -                        u'Zimbabwe': u'Zimbabwe',
4622 -                        u'Zoug': u'Suisse',
4623 -                        u'Zurich': u'Suisse',
4624 -                        u'Z\xe1kynthos': u'Gr\xe8ce',
4625 -                        u'Z\xe9lande': u'Pays-Bas',
4626 -                        u'aire g\xe9ographique ancienne': u'aire g\xe9ographique ancienne',
4627 -                        u'code non adapt\xe9': u'code non adapt\xe9',
4628 -                        u'inconnu': u'inconnu',
4629 -                        u'intergouvernemental': u'intergouvernemental',
4630 -                        u'multiple': u'multiple',
4631 -                        u"ne s'applique pas": u"ne s'applique pas",
4632 -                        u'non renseign\xe9': u'non renseign\xe9',
4633 -                        u'\xc1rta': u'Gr\xe8ce',
4634 -                        u'\xc9gypte': u'\xc9gypte',
4635 -                        u'\xc9lide': u'Gr\xe8ce',
4636 -                        u'\xc9mathie': u'Gr\xe8ce',
4637 -                        u'\xc9milie-Romagne': u'Italie',
4638 -                        u'\xc9mirats arabes unis': u'\xc9mirats arabes unis',
4639 -                        u'\xc9pire': u'Gr\xe8ce',
4640 -                        u'\xc9quateur': u'\xc9quateur',
4641 -                        u'\xc9rythr\xe9e': u'\xc9rythr\xe9e',
4642 -                        u'\xc9tats-Unis': u'\xc9tats-Unis',
4643 -                        u'\xc9thiopie': u'\xc9thiopie',
4644 -                        u'\xc9tolie-et-Acarnanie': u'Gr\xe8ce',
4645 -                        u'\xc9vros': u'Gr\xe8ce',
4646 -                        u'\xcele Pierre 1er (Norv\xe8ge)': u'Antarctique',
4647 -                        u'\xcele de Man': u'\xcele de Man',
4648 -                        u'\xcele du Prince-\xc9douard': u'Canada',
4649 -                        u'\xcele-de-France': u'France',
4650 -                        u'\xceles Bal\xe9ares': u'Espagne',
4651 -                        u'\xceles Ioniennes': u'Gr\xe8ce',
4652 -                        u'\xceles Vierges am\xe9ricaines': u'\xceles Vierges am\xe9ricaines',
4653 -                        u'\xceles Vierges britanniques': u'\xceles Vierges britanniques',
4654 -                        u'\xceles de la Mer \xc9g\xe9e m\xe9ridionale': u'Gr\xe8ce',
4655 -                        u'\xceles de la Mer \xc9g\xe9e septentrionale': u'Gr\xe8ce',
4656 -                        u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis': u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis'
4657 -                                                }
diff --git a/reference_data/countries_iso_3166.txt b/reference_data/countries_iso_3166.txt
@@ -1,269 +0,0 @@
4658 -##,non renseigné
4659 -..,non renseigné
4660 -aa,aire géographique ancienne
4661 -ad,Andorre
4662 -ae,Émirats arabes unis
4663 -af,Afghanistan
4664 -ag,Antigua-et-Barbuda
4665 -ai,Anguilla
4666 -al,Albanie
4667 -am,Arménie
4668 -an,Antilles néerlandaises
4669 -ao,Angola
4670 -aq,Antarctique
4671 -ar,Argentine
4672 -as,Samoa américaines
4673 -at,Autriche
4674 -au,Australie
4675 -aw,Aruba
4676 -ax,Aland (îles)
4677 -az,Azerbaïdjan
4678 -ba,Bosnie-Herzégovine
4679 -bb,Barbade
4680 -bd,Bangladesh
4681 -be,Belgique
4682 -bf,Burkina
4683 -bg,Bulgarie
4684 -bh,Bahreïn
4685 -bi,Burundi
4686 -bj,Bénin
4687 -bl,Saint-Barthélemy
4688 -bm,Bermudes
4689 -bn,Brunéi
4690 -bo,Bolivie
4691 -bq,Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache
4692 -br,Brésil
4693 -bs,Bahamas
4694 -bt,Bhoutan
4695 -bv,Bouvet (île)
4696 -bw,Botswana
4697 -by,Biélorussie,Bélarus
4698 -bz,Belize
4699 -ca,Canada
4700 -cc,Cocos (îles),Keeling (îles)
4701 -cd,Congo (République démocratique),Zaïre
4702 -cf,Centrafrique,République centrafricaine
4703 -cg,Congo,Congo (République)
4704 -ch,Suisse,Confédération helvétique
4705 -ci,Côte d'Ivoire
4706 -ck,Cook (îles)
4707 -cl,Chili
4708 -cm,Cameroun
4709 -cn,Chine,Chine (République populaire)
4710 -co,Colombie
4711 -cr,Costa Rica
4712 -cs,Serbie-et-Monténégro
4713 -cu,Cuba
4714 -cv,Cap-Vert
4715 -cw,Curaçao
4716 -cx,Christmas (île)
4717 -cy,Chypre
4718 -cz,République tchèque,Tchèque, République
4719 -dd,Allemagne (République démocratique)
4720 -de,Allemagne,Allemagne (République fédérale)
4721 -dj,Djibouti
4722 -dk,Danemark
4723 -dm,Dominique
4724 -do,République dominicaine,Dominicaine, République
4725 -dz,Algérie
4726 -ec,Équateur
4727 -ee,Estonie
4728 -eg,Égypte
4729 -eh,Sahara occidental
4730 -er,Érythrée
4731 -es,Espagne
4732 -et,Éthiopie
4733 -fi,Finlande
4734 -fj,Fidji
4735 -fk,Malouines (îles),Falkland (îles)
4736 -fm,Micronésie,États fédérés de Micronésie
4737 -fo,Féroé (îles)
4738 -fr,France
4739 -ga,Gabon
4740 -gb,Grande-Bretagne,Royaume-Uni
4741 -gd,Grenade
4742 -ge,Géorgie
4743 -gf,Guyane française
4744 -gg,Guernesey
4745 -gh,Ghana
4746 -gi,Gibraltar
4747 -gl,Groenland
4748 -gm,Gambie
4749 -gn,Guinée
4750 -gp,Guadeloupe
4751 -gq,Guinée équatoriale
4752 -gr,Grèce
4753 -gs,Géorgie du Sud et les îles Sandwich du Sud
4754 -gt,Guatemala
4755 -gu,Guam
4756 -gw,Guinée-Bissau
4757 -gy,Guyana
4758 -hk,Hong Kong
4759 -hm,Heard (île) et îles McDonald
4760 -hn,Honduras
4761 -hr,Croatie
4762 -ht,Haïti
4763 -hu,Hongrie
4764 -id,Indonésie
4765 -ie,Irlande
4766 -ii,intergouvernemental
4767 -il,Israël
4768 -im,Île de Man,Man, Île de
4769 -in,Inde
4770 -io,Territoire britannique de l'Océan indien,Chagos (îles)###Océan indien, Territoire britannique de l'
4771 -iq,Irak
4772 -ir,Iran
4773 -is,Islande
4774 -it,Italie
4775 -je,Jersey
4776 -jm,Jamaïque
4777 -jo,Jordanie
4778 -jp,Japon
4779 -ke,Kenya
4780 -kg,Kirghizistan
4781 -kh,Cambodge
4782 -ki,Kiribati
4783 -km,Comores
4784 -kn,Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis
4785 -ko,Kosovo
4786 -kp,Corée (République populaire démocratique),Corée du Nord
4787 -kr,Corée (République),Corée du Sud
4788 -kw,Koweït
4789 -ky,Cayman,Caïmanes, Îles###Caïman (îles)
4790 -kz,Kazakhstan
4791 -la,Laos
4792 -lb,Liban
4793 -lc,Sainte-Lucie
4794 -li,Liechtenstein
4795 -lk,Sri Lanka
4796 -lr,Liberia
4797 -ls,Lesotho
4798 -lt,Lituanie
4799 -lu,Luxembourg
4800 -lv,Lettonie
4801 -ly,Libye
4802 -ma,Maroc
4803 -mc,Monaco
4804 -md,Moldavie,Moldova, République de
4805 -me,Monténégro
4806 -mf,Saint-Martin (partie française)
4807 -mg,Madagascar
4808 -mh,Marshall (îles)
4809 -mk,Macédoine (République)
4810 -ml,Mali
4811 -mm,Myanmar,Birmanie
4812 -mn,Mongolie
4813 -mo,Macao
4814 -mp,Mariannes du Nord (îles)
4815 -mq,Martinique
4816 -mr,Mauritanie
4817 -ms,Montserrat
4818 -mt,Malte
4819 -mu,Maurice
4820 -mv,Maldives
4821 -mw,Malawi
4822 -mx,Mexique
4823 -my,Malaisie
4824 -mz,Mozambique
4825 -na,Namibie
4826 -nc,Nouvelle-Calédonie
4827 -ne,Niger
4828 -nf,Norfolk (île)
4829 -ng,Nigeria
4830 -ni,Nicaragua
4831 -nl,Pays-Bas
4832 -no,Norvège
4833 -np,Népal
4834 -nr,Nauru
4835 -nu,Niue
4836 -nz,Nouvelle-Zélande
4837 -om,Oman
4838 -oo,code non adapté
4839 -pa,Panama
4840 -pe,Pérou
4841 -pf,Polynésie française
4842 -pg,Papouasie-Nouvelle-Guinée
4843 -ph,Philippines
4844 -pk,Pakistan
4845 -pl,Pologne
4846 -pm,Saint-Pierre-et-Miquelon
4847 -pn,Pitcairn
4848 -pr,Porto Rico
4849 -ps,Autorité palestinienne,Palestine
4850 -pt,Portugal
4851 -pw,Palau,Palaos
4852 -py,Paraguay
4853 -qa,Qatar
4854 -re,Réunion
4855 -ro,Roumanie
4856 -rs,Serbie
4857 -ru,Russie (Fédération),Russie
4858 -rw,Rwanda
4859 -sa,Arabie saoudite
4860 -sb,Salomon (îles)
4861 -sc,Seychelles
4862 -sd,Soudan
4863 -se,Suède
4864 -sg,Singapour
4865 -sh,Sainte-Hélène,Ascension (île)###Tristan da Cunha (île)
4866 -si,Slovénie
4867 -sj,Svalbard et île Jan Mayen
4868 -sk,Slovaquie
4869 -sl,Sierra Leone
4870 -sm,Saint-Marin
4871 -sn,Sénégal
4872 -so,Somalie
4873 -sr,Suriname
4874 -ss,Soudan du Sud,Sud Soudan
4875 -st,Sao Tomé-et-Principe
4876 -su,URSS
4877 -sv,El Salvador,Salvador
4878 -sx,Saint-Martin (partie néerlandaise),Sint Maarten
4879 -sy,Syrie
4880 -sz,Swaziland
4881 -tc,Turks et Caïques (îles)
4882 -td,Tchad
4883 -tf,Terres australes françaises
4884 -tg,Togo
4885 -th,Thaïlande
4886 -tj,Tadjikistan
4887 -tk,Tokelau
4888 -tl,Timor oriental
4889 -tm,Turkménistan
4890 -tn,Tunisie
4891 -to,Tonga
4892 -tr,Turquie
4893 -tt,Trinité-et-Tobago
4894 -tv,Tuvalu
4895 -tw,Taïwan,Chine (République)
4896 -tz,Tanzanie
4897 -ua,Ukraine
4898 -ug,Ouganda
4899 -um,Îles mineures éloignées des États-Unis
4900 -us,États-Unis
4901 -uy,Uruguay
4902 -uz,Ouzbékistan
4903 -va,Vatican,Saint-Siège
4904 -vc,Saint-Vincent-et-les Grenadines
4905 -ve,Venezuela
4906 -vg,Îles Vierges britanniques,Vierges (îles) britanniques
4907 -vi,Îles Vierges américaines,Vierges (îles) américaines
4908 -vn,Viet Nam
4909 -vu,Vanuatu
4910 -wf,Wallis et Futuna (îles)
4911 -ws,Samoa,Samoa occidentales
4912 -xc,Tchécoslovaquie
4913 -xd,Allemagne avant 1945
4914 -xe,Europe,Union européenne
4915 -xk,Corée avant 1948
4916 -xn,Pays-Bas avant 1830,Belgique avant 1830
4917 -xx,inconnu
4918 -yd,Yémen (République démocratique populaire),Sud Yémen
4919 -ye,Yémen
4920 -yt,Mayotte
4921 -yu,Yougoslavie
4922 -yy,ne s'applique pas
4923 -za,Afrique du Sud
4924 -zm,Zambie
4925 -zw,Zimbabwe
4926 -zz,multiple
diff --git a/reference_data/stopwords.py b/reference_data/stopwords.py
@@ -1,15 +0,0 @@
4927 -# -*- coding: utf-8 -*-
4928 -"""
4929 -Stopwords in different languages.
4930 -"""
4931 -
4932 -FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
4933 -
4934 -
4935 -ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 
'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 
'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
4936 -
4937 -
4938 -ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 
'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 
'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
4939 -
4940 -
4941 -ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 
'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 
'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
diff --git a/reference_data/us_states.py b/reference_data/us_states.py
@@ -1,211 +0,0 @@
4942 -# -*- coding: utf-8 -*-
4943 -
4944 -# See http://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations
4945 -# WARNING: The name of each state should be in French
4946 -# (e.g. "Floride", not "Florida")
4947 -US_STATES = {'AK': 'Alaska',
4948 -             'AL': 'Alabama',
4949 -             'AR': 'Arkansas',
4950 -             'AZ': 'Arizona',
4951 -             'Ala.': 'Alabama',
4952 -             'Alas.': 'Alaska',
4953 -             'Alaska': 'Alaska',
4954 -             'Ariz.': 'Arizona',
4955 -             'Ark.': 'Arkansas',
4956 -             'Az.': 'Arizona',
4957 -             'CA': 'Californie',
4958 -             'CF': 'Californie',
4959 -             'CL': 'Colorado',
4960 -             'CO': 'Colorado',
4961 -             'CT': 'Connecticut',
4962 -             'Ca.': 'Californie',
4963 -             'Cal.': 'Californie',
4964 -             'Cali.': 'Californie',
4965 -             'Calif.': 'Californie',
4966 -             'Col.': 'Colorado',
4967 -             'Colo.': 'Colorado',
4968 -             'Conn.': 'Connecticut',
4969 -             'Ct.': 'Connecticut',
4970 -             'D.C.': 'District of ColuFederal district',
4971 -             'DC': 'District of ColuFederal district',
4972 -             'DE': 'Delaware',
4973 -             'DL': 'Delaware',
4974 -             'De.': 'Delaware',
4975 -             'Del.': 'Delaware',
4976 -             'FL': 'Floride',
4977 -             'Fl.': 'Floride',
4978 -             'Fla.': 'Floride',
4979 -             'Flor.': 'Floride',
4980 -             'GA': u'Géorgie',
4981 -             'Ga.': u'Géorgie',
4982 -             'H.I.': 'Hawaii',
4983 -             'HA': 'Hawaii',
4984 -             'HI': 'Hawaii',
4985 -             'Hawaii': 'Hawaii',
4986 -             'IA': 'Iowa',
4987 -             'ID': 'Idaho',
4988 -             'IL': 'Illinois',
4989 -             'IN': 'Indiana',
4990 -             'Ia.': 'Iowa',
4991 -             'Id.': 'Idaho',
4992 -             'Ida.': 'Idaho',
4993 -             'Idaho': 'Idaho',
4994 -             'Il.': 'Illinois',
4995 -             "Ill's": 'Illinois',
4996 -             'Ill.': 'Illinois',
4997 -             'Ills.': 'Illinois',
4998 -             'In.': 'Indiana',
4999 -             'Ind.': 'Indiana',
5000 -             'Ioa.': 'Iowa',
5001 -             'Iowa': 'Iowa',
5002 -             'KA': 'Kansas',
5003 -             'KS': 'Kansas',
5004 -             'KY': 'Kentucky',
5005 -             'Ka.': 'Kansas',
5006 -             'Kan.': 'Kansas',
5007 -             'Kans.': 'Kansas',
5008 -             'Ks.': 'Kansas',
5009 -             'Ky.': 'Kentucky',
5010 -             'LA': 'Louisiane',
5011 -             'La.': 'Louisiane',
5012 -             'MA': 'Massachusetts',
5013 -             'MC': 'Michigan',
5014 -             'MD': 'Maryland',
5015 -             'ME': 'Maine',
5016 -             'MI': 'Mississippi',
5017 -             'MN': 'Minnesota',
5018 -             'MO': 'Missouri',
5019 -             'MS': 'Mississippi',
5020 -             'MT': 'Montana',
5021 -             'Maine': 'Maine',
5022 -             'Mass.': 'Massachusetts',
5023 -             'Md.': 'Maryland',
5024 -             'Me.': 'Maine',
5025 -             'Mich.': 'Michigan',
5026 -             'Minn.': 'Minnesota',
5027 -             'Miss.': 'Mississippi',
5028 -             'Mn.': 'Minnesota',
5029 -             'Mo.': 'Missouri',
5030 -             'Mont.': 'Montana',
5031 -             'N. Car.': 'Caroline du Nord',
5032 -             'N. Dak.': 'Dakota du Nord',
5033 -             'N. Mex.': 'Nouveau-Mexique',
5034 -             'N. York': 'New York',
5035 -             'N.C.': 'Caroline du Nord',
5036 -             'N.D.': 'Dakota du Nord',
5037 -             'N.H.': 'New Hampshire',
5038 -             'N.J.': 'New Jersey',
5039 -             'N.M.': 'Nouveau-Mexique',
5040 -             'N.Y.': 'New York',
5041 -             'NB': 'Nebraska',
5042 -             'NC': 'Caroline du Nord',
5043 -             'ND': 'Dakota du Nord',
5044 -             'NE': 'Nebraska',
5045 -             'NH': 'New Hampshire',
5046 -             'NJ': 'New Jersey',
5047 -             'NM': 'Nouveau-Mexique',
5048 -             'NV': 'Nevada',
5049 -             'NY': 'New York',
5050 -             'Neb.': 'Nebraska',
5051 -             'Nebr.': 'Nebraska',
5052 -             'Nev.': 'Nevada',
5053 -             'New M.': 'Nouveau-Mexique',
5054 -             'NoDak': 'Dakota du Nord',
5055 -             'Nv.': 'Nevada',
5056 -             'O.': 'Ohio',
5057 -             'OH': 'Ohio',
5058 -             'OK': 'Oklahoma',
5059 -             'OR': 'Oregon',
5060 -             'Oh.': 'Ohio',
5061 -             'Ohio': 'Ohio',
5062 -             'Ok.': 'Oklahoma',
5063 -             'Okla.': 'Oklahoma',
5064 -             'Or.': 'Oregon',
5065 -             'Ore.': 'Oregon',
5066 -             'Oreg.': 'Oregon',
5067 -             'PA': 'Pennsylvanie',
5068 -             'Pa.': 'Pennsylvanie',
5069 -             'R.I.': 'Rhode Island',
5070 -             'R.I. & P.P.': 'Rhode Island',
5071 -             'RI': 'Rhode Island',
5072 -             'S. Car.': 'Caroline du Sud',
5073 -             'S. Dak.': 'Dakota du Sud',
5074 -             'S.C.': 'Caroline du Sud',
5075 -             'S.D.': 'Dakota du Sud',
5076 -             'SC': 'Caroline du Sud',
5077 -             'SD': 'Dakota du Sud',
5078 -             'SoDak': 'Dakota du Sud',
5079 -             'State': 'Utah',
5080 -             'TN': 'Tennessee',
5081 -             'TX': 'Texas',
5082 -             'Tenn.': 'Tennessee',
5083 -             'Tex.': 'Texas',
5084 -             'Texas': 'Texas',
5085 -             'Tn.': 'Tennessee',
5086 -             'Tx.': 'Texas',
5087 -             'US-AL': 'Alabama',
5088 -             'US-AR': 'Arkansas',
5089 -             'US-AZ': 'Arizona',
5090 -             'US-CA': 'Californie',
5091 -             'US-CO': 'Colorado',
5092 -             'US-CT': 'Connecticut',
5093 -             'US-DC': 'District of ColuFederal district',
5094 -             'US-DE': 'Delaware',
5095 -             'US-FL': 'Floride',
5096 -             'US-GA': u'Géorgie',
5097 -             'US-IL': 'Illinois',
5098 -             'US-IN': 'Indiana',
5099 -             'US-KY': 'Kentucky',
5100 -             'US-LA': 'Louisiane',
5101 -             'US-MA': 'Massachusetts',
5102 -             'US-MD': 'Maryland',
5103 -             'US-MI': 'Michigan',
5104 -             'US-MN': 'Minnesota',
5105 -             'US-MO': 'Missouri',
5106 -             'US-MS': 'Mississippi',
5107 -             'US-MT': 'Montana',
5108 -             'US-NC': 'Caroline du Nord',
5109 -             'US-ND': 'Dakota du Nord',
5110 -             'US-NE': 'Nebraska',
5111 -             'US-NH': 'New Hampshire',
5112 -             'US-NJ': 'New Jersey',
5113 -             'US-NM': 'Nouveau-Mexique',
5114 -             'US-NY': 'New York',
5115 -             'US-OK': 'Oklahoma',
5116 -             'US-PA': 'Pennsylvanie',
5117 -             'US-RI': 'Rhode Island',
5118 -             'US-SC': 'Caroline du Sud',
5119 -             'US-SD': 'Dakota du Sud',
5120 -             'US-TN': 'Tennessee',
5121 -             'US-VA': 'Virginia',
5122 -             'US-VT': 'Vermont',
5123 -             'US-WA': 'Washington',
5124 -             'US-WI': 'Wisconsin',
5125 -             'US-WV': 'Virginie occidentale',
5126 -             'US-WY': 'Wyoming',
5127 -             'UT': 'Utah',
5128 -             'Ut.': 'Utah',
5129 -             'Utah': 'Utah',
5130 -             'VA': 'Virginia',
5131 -             'VT': 'Vermont',
5132 -             'Va.': 'Virginia',
5133 -             'Vt.': 'Vermont',
5134 -             'W. Va.': 'Virginie occidentale',
5135 -             'W. Virg.': 'Virginie occidentale',
5136 -             'W.V.': 'Virginie occidentale',
5137 -             'W.Va.': 'Virginie occidentale',
5138 -             'WA': 'Washington',
5139 -             'WI': 'Wisconsin',
5140 -             'WN': 'Washington',
5141 -             'WS': 'Wisconsin',
5142 -             'WV': 'Virginie occidentale',
5143 -             'WY': 'Wyoming',
5144 -             'Wa.': 'Washington',
5145 -             'Wash.': 'Washington',
5146 -             'Wash. D.C.': 'District of ColuFederal district',
5147 -             'Wi.': 'Wisconsin',
5148 -             'Wis.': 'Wisconsin',
5149 -             'Wisc.': 'Wisconsin',
5150 -             'Wn.': 'Washington',
5151 -             'Wy.': 'Wyoming',
5152 -             'Wyo.': 'Wyoming'}
diff --git a/rl/__init__.py b/rl/__init__.py
diff --git a/rl/aligner.py b/rl/aligner.py
@@ -0,0 +1,324 @@
5153 +# -*- coding:utf-8 -*-
5154 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
5155 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
5156 +#
5157 +# This program is free software: you can redistribute it and/or modify it under
5158 +# the terms of the GNU Lesser General Public License as published by the Free
5159 +# Software Foundation, either version 2.1 of the License, or (at your option)
5160 +# any later version.
5161 +#
5162 +# This program is distributed in the hope that it will be useful, but WITHOUT
5163 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
5164 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
5165 +# details.
5166 +#
5167 +# You should have received a copy of the GNU Lesser General Public License along
5168 +# with this program. If not, see <http://www.gnu.org/licenses/>.
5169 +import time
5170 +import logging
5171 +from collections import defaultdict
5172 +
5173 +from scipy import zeros
5174 +from scipy.sparse import lil_matrix
5175 +
5176 +from nazca.utils.dataio import parsefile
5177 +
5178 +
5179 +###############################################################################
5180 +### UTILITY FUNCTIONS #########################################################
5181 +###############################################################################
5182 +def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
5183 +    """ Return the aligned pairs
5184 +    """
5185 +    if unique:
5186 +        for refid in global_matched:
5187 +            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
5188 +            ref_record = refset[refid]
5189 +            target_record = targetset[bestid]
5190 +            distance = global_mat[refid, bestid] if global_mat is not None else None
5191 +            yield (ref_record[0], refid), (target_record[0], bestid), distance
5192 +    else:
5193 +        for refid in global_matched:
5194 +            for targetid, _ in global_matched[refid]:
5195 +                ref_record = refset[refid]
5196 +                target_record = targetset[targetid]
5197 +                distance = global_mat[refid, targetid] if global_mat is not None else None
5198 +                yield (ref_record[0], refid), (target_record[0], targetid), distance
5199 +
5200 +
5201 +###############################################################################
5202 +### BASE ALIGNER OBJECT #######################################################
5203 +###############################################################################
5204 +class BaseAligner(object):
5205 +
5206 +    def __init__(self, threshold, processings, normalize_matrix=False):
5207 +        self.threshold = threshold
5208 +        self.processings = processings
5209 +        self.normalize_matrix = normalize_matrix
5210 +        self.ref_normalizer = None
5211 +        self.target_normalizer = None
5212 +        self.target_normalizer = None
5213 +        self.blocking = None
5214 +        self.alignments_done = 0
5215 +        self.pairs_found = 0
5216 +        self.nb_comparisons = 0
5217 +        self.nb_blocks = 0
5218 +        self.refset_size = None
5219 +        self.targetset_size = None
5220 +        self.time = None
5221 +        self.logger = logging.getLogger('nazca.aligner')
5222 +
5223 +    def register_ref_normalizer(self, normalizer):
5224 +        """ Register normalizers to be applied
5225 +        before alignment """
5226 +        self.ref_normalizer = normalizer
5227 +
5228 +    def register_target_normalizer(self, normalizer):
5229 +        """ Register normalizers to be applied
5230 +        before alignment """
5231 +        self.target_normalizer = normalizer
5232 +
5233 +    def register_blocking(self, blocking):
5234 +        self.blocking = blocking
5235 +
5236 +    def apply_normalization(self, dataset, normalizer):
5237 +        if normalizer:
5238 +            return normalizer.normalize_dataset(dataset)
5239 +        return dataset
5240 +
5241 +    def compute_distance_matrix(self, refset, targetset,
5242 +                                ref_indexes, target_indexes):
5243 +        """ Compute and return the global alignment matrix.
5244 +        For each `processing` a `Distancematrix` is built, then all the
5245 +        matrices are summed with their own weighting and the result is the global
5246 +        alignment matrix, which is returned.
5247 +        """
5248 +        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
5249 +        for processing in self.processings:
5250 +            distmatrix += processing.cdist(refset, targetset,
5251 +                                          ref_indexes, target_indexes)
5252 +        return distmatrix
5253 +
5254 +    def threshold_matched(self, distmatrix):
5255 +        """ Return the matched elements within a dictionary,
5256 +        each key being the index from X, and the corresponding
5257 +        values being a list of couples (index from Y, distance)
5258 +        """
5259 +        match = defaultdict(list)
5260 +        if self.normalize_matrix:
5261 +            distmatrix /= distmatrix.max()
5262 +        ind = (distmatrix <= self.threshold).nonzero()
5263 +        indrow = ind[0].tolist()
5264 +        indcol = ind[1].tolist()
5265 +        for (i, j) in zip(indrow, indcol):
5266 +            match[i].append((j, distmatrix[i, j]))
5267 +        return match
5268 +
5269 +    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
5270 +        # Build items
5271 +        items = []
5272 +        ref_indexes = ref_indexes or xrange(len(refset))
5273 +        target_indexes = target_indexes or xrange(len(targetset))
5274 +        # Apply alignments
5275 +        mat = self.compute_distance_matrix(refset, targetset,
5276 +                                           ref_indexes=ref_indexes,
5277 +                                           target_indexes=target_indexes)
5278 +        matched = self.threshold_matched(mat)
5279 +        # Reapply matched to global indexes
5280 +        new_matched = {}
5281 +        for k, values in matched.iteritems():
5282 +            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
5283 +        return mat, new_matched
5284 +
5285 +    def align(self, refset, targetset, get_matrix=True):
5286 +        """ Perform the alignment on the referenceset
5287 +        and the targetset
5288 +        """
5289 +        start_time = time.time()
5290 +        refset = self.apply_normalization(refset, self.ref_normalizer)
5291 +        targetset = self.apply_normalization(targetset, self.target_normalizer)
5292 +        self.refset_size = len(refset)
5293 +        self.targetset_size = len(targetset)
5294 +        # If no blocking
5295 +        if not self.blocking:
5296 +            return self._get_match(refset, targetset)
5297 +        # Blocking == conquer_and_divide
5298 +        global_matched = {}
5299 +        global_mat = lil_matrix((len(refset), len(targetset)))
5300 +        self.blocking.fit(refset, targetset)
5301 +        for refblock, targetblock in self.blocking.iter_blocks():
5302 +            self.nb_blocks += 1
5303 +            ref_index = [r[0] for r in refblock]
5304 +            target_index = [r[0] for r in targetblock]
5305 +            self.nb_comparisons += len(ref_index)*len(target_index)
5306 +            _, matched = self._get_match(refset, targetset, ref_index, target_index)
5307 +            for k, values in matched.iteritems():
5308 +                subdict = global_matched.setdefault(k, set())
5309 +                for v, d in values:
5310 +                    subdict.add((v, d))
5311 +                    self.alignments_done += 1
5312 +                    if get_matrix:
5313 +                        # XXX avoid issue in sparse matrix
5314 +                        global_mat[k, v] = d or 10**(-10)
5315 +        self.time = time.time() - start_time
5316 +        return global_mat, global_matched
5317 +
5318 +    def get_aligned_pairs(self, refset, targetset, unique=True):
5319 +        """ Get the pairs of aligned elements
5320 +        """
5321 +        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
5322 +        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
5323 +            self.pairs_found += 1
5324 +            yield pair
5325 +        self.log_infos()
5326 +
5327 +    def align_from_files(self, reffile, targetfile,
5328 +                         ref_indexes=None, target_indexes=None,
5329 +                         ref_encoding=None, target_encoding=None,
5330 +                         ref_separator='\t', target_separator='\t',
5331 +                         get_matrix=True):
5332 +        """ Align data from files
5333 +
5334 +        Parameters
5335 +        ----------
5336 +
5337 +        reffile: name of the reference file
5338 +
5339 +        targetfile: name of the target file
5340 +
5341 +        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
5342 +                      be used to read the files.
5343 +
5344 +        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
5345 +                         be used to read the files.
5346 +
5347 +        ref_separator: separator of the reference file
5348 +
5349 +        target_separator: separator of the target file
5350 +        """
5351 +        refset = parsefile(reffile, indexes=ref_indexes,
5352 +                           encoding=ref_encoding, delimiter=ref_separator)
5353 +        targetset = parsefile(targetfile, indexes=target_indexes,
5354 +                              encoding=target_encoding, delimiter=target_separator)
5355 +        return self.align(refset, targetset, get_matrix=get_matrix)
5356 +
5357 +    def get_aligned_pairs_from_files(self, reffile, targetfile,
5358 +                         ref_indexes=None, target_indexes=None,
5359 +                         ref_encoding=None, target_encoding=None,
5360 +                         ref_separator='\t', target_separator='\t',
5361 +                         unique=True):
5362 +        """ Get the pairs of aligned elements
5363 +        """
5364 +        refset = parsefile(reffile, indexes=ref_indexes,
5365 +                           encoding=ref_encoding, delimiter=ref_separator)
5366 +        targetset = parsefile(targetfile, indexes=target_indexes,
5367 +                              encoding=target_encoding, delimiter=target_separator)
5368 +        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
5369 +        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
5370 +            yield pair
5371 +
5372 +    def log_infos(self):
5373 +        """ Display some info on the aligner process
5374 +        """
5375 +        self.logger.info('Computation time : %s' % self.time)
5376 +        self.logger.info('Size reference set : %s' % self.refset_size)
5377 +        self.logger.info('Size target set : %s' % self.targetset_size)
5378 +        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
5379 +        self.logger.info('Alignments done : %s' % self.alignments_done)
5380 +        self.logger.info('Pairs found : %s' % self.pairs_found)
5381 +        self.logger.info('Ratio reference set/alignments done : %s'
5382 +                         % (self.alignments_done/float(self.refset_size)))
5383 +        self.logger.info('Ratio target set/alignments done : %s'
5384 +                         % (self.alignments_done/float(self.targetset_size)))
5385 +        self.logger.info('Ratio reference set/pairs found : %s'
5386 +                         % (self.pairs_found/float(self.refset_size)))
5387 +        self.logger.info('Ratio target set/pairs found : %s'
5388 +                         % (self.pairs_found/float(self.targetset_size)))
5389 +        self.logger.info('Maximum comparisons : %s'
5390 +                         % (self.refset_size * self.targetset_size))
5391 +        self.logger.info('Number of blocks : %s' % self.nb_blocks)
5392 +        if self.nb_blocks:
5393 +            self.logger.info('Ratio comparisons/block : %s'
5394 +                             % (float(self.nb_comparisons)/self.nb_blocks))
5395 +        self.logger.info('Blocking reduction : %s'
5396 +                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
5397 +
5398 +
5399 +###############################################################################
5400 +### PIPELINE ALIGNER OBJECT ##################################################
5401 +###############################################################################
5402 +class PipelineAligner(object):
5403 +    """ This pipeline will perform iterative alignments, removing each time
5404 +    the aligned results from the previous aligner.
5405 +    """
5406 +
5407 +    def __init__(self, aligners):
5408 +        self.aligners = aligners
5409 +        self.pairs = {}
5410 +        self.nb_comparisons = 0
5411 +        self.nb_blocks = 0
5412 +        self.alignments_done = 0
5413 +        self.pairs_found = 0
5414 +        self.refset_size = None
5415 +        self.targetset_size = None
5416 +        self.time = None
5417 +        self.logger = logging.getLogger('nazca.aligner')
5418 +
5419 +    def get_aligned_pairs(self, refset, targetset, unique=True):
5420 +        """ Get the pairs of aligned elements
5421 +        """
5422 +        start_time = time.time()
5423 +        ref_index = range(len(refset))
5424 +        target_index = range(len(targetset))
5425 +        self.refset_size = len(refset)
5426 +        self.targetset_size = len(targetset)
5427 +        global_matched = {}
5428 +        global_mat = lil_matrix((len(refset), len(targetset)))
5429 +        seen_refset = set()
5430 +        # Iteration over aligners
5431 +        for ind_aligner, aligner in enumerate(self.aligners):
5432 +            # Perform alignment
5433 +            _refset = [refset[i] for i in ref_index]
5434 +            _targetset = [targetset[i] for i in target_index]
5435 +            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
5436 +                self.pairs_found += 1
5437 +                pair = ((pair[0][0], ref_index[pair[0][1]]),
5438 +                        (pair[1][0], target_index[pair[1][1]]))
5439 +                yield pair
5440 +                seen_refset.add(pair[0][1])
5441 +            # Store stats
5442 +            self.nb_blocks += aligner.nb_blocks
5443 +            self.nb_comparisons += aligner.nb_comparisons
5444 +            # Update indexes if necessary
5445 +            # For now, we remove all the reference set that are already matched
5446 +            if ind_aligner < len(self.aligners) - 1:
5447 +                # There are other aligners after this one
5448 +                ref_index = [i for i in ref_index if i not in seen_refset]
5449 +        self.time = time.time() - start_time
5450 +        self.log_infos()
5451 +
5452 +    def log_infos(self):
5453 +        """ Display some info on the aligner process
5454 +        """
5455 +        self.logger.info('Computation time : %s' % self.time)
5456 +        self.logger.info('Size reference set : %s' % self.refset_size)
5457 +        self.logger.info('Size target set : %s' % self.targetset_size)
5458 +        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
5459 +        self.logger.info('Alignments done : %s' % self.alignments_done)
5460 +        self.logger.info('Pairs found : %s' % self.pairs_found)
5461 +        self.logger.info('Ratio reference set/alignments done : %s'
5462 +                         % (self.alignments_done/float(self.refset_size)))
5463 +        self.logger.info('Ratio target set/alignments done : %s'
5464 +                         % (self.alignments_done/float(self.targetset_size)))
5465 +        self.logger.info('Ratio reference set/pairs found : %s'
5466 +                         % (self.pairs_found/float(self.refset_size)))
5467 +        self.logger.info('Ratio target set/pairs found : %s'
5468 +                         % (self.pairs_found/float(self.targetset_size)))
5469 +        self.logger.info('Maximum comparisons : %s'
5470 +                         % (self.refset_size * self.targetset_size))
5471 +        self.logger.info('Number of blocks : %s' % self.nb_blocks)
5472 +        if self.nb_blocks:
5473 +            self.logger.info('Ratio comparisons/block : %s'
5474 +                             % (float(self.nb_comparisons)/self.nb_blocks))
5475 +        self.logger.info('Blocking reduction : %s'
5476 +                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
diff --git a/rl/blocking.py b/rl/blocking.py
@@ -0,0 +1,666 @@
5477 +# -*- coding:utf-8 -*-
5478 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
5479 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
5480 +#
5481 +# This program is free software: you can redistribute it and/or modify it under
5482 +# the terms of the GNU Lesser General Public License as published by the Free
5483 +# Software Foundation, either version 2.1 of the License, or (at your option)
5484 +# any later version.
5485 +#
5486 +# This program is distributed in the hope that it will be useful, but WITHOUT
5487 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
5488 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
5489 +# details.
5490 +#
5491 +# You should have received a copy of the GNU Lesser General Public License along
5492 +# with this program. If not, see <http://www.gnu.org/licenses/>.
5493 +
5494 +
5495 +""" Blocking techniques.
5496 +
5497 +This module implements a set of blocking techniques used to split
5498 +datasets in smaller subsets that will be aligned in more details.
5499 +
5500 +Additional information:
5501 +
   P. Christen, Data Matching, Data-Centric Systems and Applications,
   Springer, 2012.
5503 +
5504 +
5505 +"""
5506 +from functools import partial
5507 +import warnings
5508 +
5509 +from scipy.spatial import KDTree
5510 +
5511 +from nazca.utils.minhashing import Minlsh
5512 +from nazca.utils.distances import soundexcode
5513 +
5514 +
5515 +###############################################################################
5516 +### GENERAL BLOCKING ##########################################################
5517 +###############################################################################
class BaseBlocking(object):
    """ An abstract general blocking object that exposes
    the API that should be common to all blockings object.

    A blocking splits a reference dataset and a target dataset into
    smaller blocks of records likely to match, so that detailed pairwise
    comparison only happens inside each block.

    Subclasses must implement `_fit()`, `_iter_blocks()` and `_cleanup()`.
    """
    def __init__(self, ref_attr_index, target_attr_index):
        """ Build the blocking object

        Parameters
        ----------

        ref_attr_index: index of the attribute of interest in a record
                        for the reference dataset
                        (i.e. attribute to be used for key computation)

        target_attr_index: index of the attribute of interest in a record
                           for the target dataset
                           (i.e. attribute to be used for key computation)
        """
        self.ref_attr_index = ref_attr_index
        self.target_attr_index = target_attr_index
        # (index, id) pairs of the datasets, filled by fit()
        self.refids = None
        self.targetids = None
        self.is_fitted = False

    def _fit(self, refset, targetset):
        """ Internal fit - must be implemented by subclasses """
        raise NotImplementedError

    def _iter_blocks(self):
        """ Internal iteration function over blocks
        """
        raise NotImplementedError

    def _cleanup(self):
        """ Internal cleanup blocking for further use (e.g. in pipeline)
        """
        raise NotImplementedError

    def fit(self, refset, targetset):
        """ Fit the blocking technique on the reference and target datasets

        Parameters
        ----------
        refset: a dataset (list of records)

        targetset: a dataset (list of records)
        """
        self._fit(refset, targetset)
        # Keep ids for blocks building (record id is assumed to be
        # the first attribute of each record)
        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
        self.is_fitted = True

    def iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contains the pair (index, id) of the record in the
                          corresponding dataset.
        """
        assert self.is_fitted
        return self._iter_blocks()

    def iter_indice_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contains the indexes of the record in the
                          corresponding dataset.
        """
        assert self.is_fitted
        for block1, block2 in self._iter_blocks():
            yield [r[0] for r in block1], [r[0] for r in block2]

    def iter_id_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contains the ids of the record in the
                          corresponding dataset.
        """
        assert self.is_fitted
        for block1, block2 in self._iter_blocks():
            yield [r[1] for r in block1], [r[1] for r in block2]

    def iter_pairs(self):
        """ Iterator over the different possible pairs.

        Returns
        -------

        (pair1, pair2): The pairs are always ((ind_reference, id_reference),
                                              (ind_target, id_target))
                        and are the ids of the record in the corresponding dataset.
        """
        assert self.is_fitted
        for block1, block2 in self.iter_blocks():
            for val1 in block1:
                for val2 in block2:
                    yield val1, val2

    def iter_indice_pairs(self):
        """ Iterator over the different possible pairs.

        Returns
        -------

        (pair1, pair2): The pairs are always (ind_reference, ind_target)
                        and are the indexes of the record in the corresponding dataset.
        """
        assert self.is_fitted
        for block1, block2 in self.iter_indice_blocks():
            for val1 in block1:
                for val2 in block2:
                    yield val1, val2

    def iter_id_pairs(self):
        """ Iterator over the different possible pairs.

        Returns
        -------

        (pair1, pair2): The pairs are always (id_reference, id_target)
                        and are the ids of the record in the corresponding dataset.
        """
        assert self.is_fitted
        for block1, block2 in self.iter_id_blocks():
            for val1 in block1:
                for val2 in block2:
                    yield val1, val2

    def cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        # Reset the fitted flag: after a cleanup, fit() must be called
        # again before blocks can be iterated (fit() will set it back
        # to True).
        self.is_fitted = False
        self._cleanup()
5663 +
5664 +###############################################################################
5665 +### KEY BLOCKING ##############################################################
5666 +###############################################################################
class KeyBlocking(BaseBlocking):
    """ This blocking technique is based on a blocking criteria
    (or blocking key), that will be used to divide the datasets.

    The main idea here is:

    1 - to create an index of f(x) for each x in the reference set.

    2 - to create an index of f(y) for each y in the target set.

    3 - to iterate on each distinct value of f(x) and to return
        the identifiers of the records of the both sets for this value.
    """

    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
        # callback computing the blocking key from the attribute value
        self.callback = callback
        # if True, records whose key is falsy (None, '') are not indexed
        self.ignore_none = ignore_none
        self.reference_index = {}
        self.target_index = {}

    def _index_dataset(self, dataset, attr_index, index):
        """ Index every record of a dataset by its blocking key,
        storing (position, id) pairs in the given index dict.
        """
        for position, record in enumerate(dataset):
            key = self.callback(record[attr_index])
            if self.ignore_none and not key:
                # skip records without a usable key
                continue
            index.setdefault(key, []).append((position, record[0]))

    def _fit(self, refset, targetset):
        """ Fit a dataset in an index using the callback
        """
        self._index_dataset(refset, self.ref_attr_index, self.reference_index)
        self._index_dataset(targetset, self.target_attr_index, self.target_index)

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the (position, id) pairs of the records
                          sharing the same key in both datasets.
        """
        for key, ref_block in self.reference_index.iteritems():
            target_block = self.target_index.get(key)
            if ref_block and target_block:
                yield (ref_block, target_block)

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.reference_index = {}
        self.target_index = {}
5722 +
5723 +
class SoundexBlocking(KeyBlocking):
    """ Key blocking where the blocking key is the soundex phonetic
    code of the attribute of interest.
    """

    def __init__(self, ref_attr_index, target_attr_index, language='french',):
        # the key callback is the soundex code for the given language
        callback = partial(soundexcode, language=language)
        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
                                              callback)
5729 +
5730 +
5731 +###############################################################################
5732 +### BIGRAM BLOCKING ###########################################################
5733 +###############################################################################
class NGramBlocking(BaseBlocking):
    """ This blocking technique is based on an n-gram key: records are
    indexed in nested dictionaries keyed by successive n-grams of the
    attribute value, with lists of (position, id) pairs at the leaves.
    """

    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
        # length of each n-gram
        self.ngram_size = ngram_size
        # number of nested dictionary levels
        self.depth = depth
        self.reference_index = {}
        self.target_index = {}

    def _fit_dataset(self, dataset, cur_index, attr_index):
        """ Fit a dataset: index each record under the successive
        n-grams of its attribute value.
        """
        size = self.ngram_size
        for position, record in enumerate(dataset):
            node = cur_index
            value = record[attr_index]
            for level in range(self.depth):
                ngram = value[level * size:(level + 1) * size]
                if level < self.depth - 1:
                    # descend (creating it if needed) into the next layer
                    node = node.setdefault(ngram, {})
            # `ngram` is the last n-gram computed by the loop: the leaf key
            node.setdefault(ngram, []).append((position, record[0]))

    def _fit(self, refset, targetset):
        """ Fit the two sets (reference set and target set)
        """
        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
        self._fit_dataset(targetset, self.target_index, self.target_attr_index)

    def _iter_dict(self, ref_node, target_node):
        """ Recursively walk both nested indexes in parallel and yield
        the leaf blocks sharing the same n-gram path.
        """
        for ngram, ref_sub in ref_node.iteritems():
            if ngram not in target_node:
                continue
            target_sub = target_node[ngram]
            if isinstance(ref_sub, dict):
                # another dictionary layer: recurse
                for blocks in self._iter_dict(ref_sub, target_sub):
                    yield blocks
            else:
                # leaf level: lists of (position, id) pairs
                yield ref_sub, target_sub

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the (position, id) pairs of the records
                          in the corresponding dataset.
        """
        for ref_block, target_block in self._iter_dict(self.reference_index,
                                                       self.target_index):
            if ref_block and target_block:
                yield ref_block, target_block

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.reference_index = {}
        self.target_index = {}
5795 +
5796 +
5797 +###############################################################################
5798 +### SORTKEY BLOCKING ##########################################################
5799 +###############################################################################
class SortedNeighborhoodBlocking(BaseBlocking):
    """ Sorted-neighborhood blocking: both datasets are merged and sorted
    on a blocking key, and each reference record is blocked with the
    target records found inside a sliding window around it.
    """

    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
        # function computing the sorting key from the attribute value
        self.key_func = key_func
        # half-width of the neighborhood window
        self.window_width = window_width
        self.sorted_dataset = None

    def _fit(self, refset, targetset):
        """ Merge both datasets and sort them on the blocking key.

        Each entry is ((position, id), attribute value, origin flag),
        the origin flag being 0 for reference records and 1 for target
        records.
        """
        merged = [((position, record[0]), record[self.ref_attr_index], 0)
                  for position, record in enumerate(refset)]
        merged += [((position, record[0]), record[self.target_attr_index], 1)
                   for position, record in enumerate(targetset)]
        merged.sort(key=lambda entry: self.key_func(entry[1]))
        self.sorted_dataset = merged

    def _iter_blocks(self):
        """ Yield, for each reference record, the target records lying
        inside the sliding window centered on it.
        """
        width = self.window_width
        for position, (rid, _value, origin) in enumerate(self.sorted_dataset):
            # target records do not start a block of their own
            if origin == 1:
                continue
            lower = max(0, position - width)
            upper = position + width + 1
            window = self.sorted_dataset[lower:upper]
            target_block = [entry[0] for entry in window if entry[2] == 1]
            if target_block:
                yield ([rid], target_block)

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.sorted_dataset = None
5840 +
5841 +
5842 +###############################################################################
5843 +### MERGE BLOCKING ############################################################
5844 +###############################################################################
class MergeBlocking(BaseBlocking):
    """ This blocking technique keeps only one appearance of one given value,
    and removes all the other records having this value.
    The merge is based on a score function

    E.g.
      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)

    could be (with a score function based on the population (third value):

      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)

    !!! WARNING !!! This is only done on ONE set (the one with a non null attr index)
    """

    def __init__(self, ref_attr_index, target_attr_index, score_func):
        """ Build the blocking object.

        Exactly one of ref_attr_index/target_attr_index should be set:
        the corresponding dataset is the one that gets merged.

        Raises
        ------
        ValueError: if both attribute indexes are None.
        """
        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
        # score function used to choose which record to keep per value
        self.score_func = score_func
        self.merged_dataset = None
        self.other_dataset = None
        if ref_attr_index is None and target_attr_index is None:
            raise ValueError('At least one of ref_attr_index or target_attr_index '
                             'should not be None')

    def _fit(self, refset, targetset):
        """ Merge the dataset whose attr index is not None;
        the other dataset is kept as is.
        """
        if self.ref_attr_index is not None:
            # Merge refset
            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
        else:
            # Merge targetset
            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]

    def _merge_dataset(self, dataset, attr_index):
        """ Merge a dataset: for each distinct value of the attribute,
        keep only the record with the highest score.

        Returns the list of kept (position, id) pairs.
        """
        best_records = {}
        for ind, record in enumerate(dataset):
            score = self.score_func(record)
            current = best_records.get(record[attr_index])
            # Keep the record if its value is unseen or if it beats the
            # currently stored score (no need to re-test dict membership
            # in a separate branch).
            if current is None or current[2] < score:
                best_records[record[attr_index]] = (ind, record, score)
        return [(ind, r[0]) for ind, r, score in best_records.itervalues()]

    def _iter_blocks(self):
        """ Yield a single (reference_block, target_block) pair made of
        the merged dataset and the untouched one.
        """
        if self.ref_attr_index is not None:
            yield self.merged_dataset, self.other_dataset
        else:
            # self.target_attr_index is not None
            yield self.other_dataset, self.merged_dataset

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.merged_dataset = None
        self.other_dataset = None
5911 +
5912 +
5913 +###############################################################################
5914 +### CLUSTERING-BASED BLOCKINGS ################################################
5915 +###############################################################################
class KmeansBlocking(BaseBlocking):
    """ A blocking technique based on Kmeans: the reference set is
    clustered, target records are predicted into the fitted clusters,
    and each cluster yields one block.
    """

    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
        # number of clusters; computed from the refset size if None
        self.n_clusters = n_clusters
        self.kmeans = None
        self.predicted = None
        from sklearn import cluster
        self.cluster_class = cluster.KMeans

    def _fit(self, refset, targetset):
        """ Cluster the reference set, then predict the cluster of every
        target record. Missing (falsy) attributes are replaced by the
        0-vector identity element.
        """
        # identity element: 0-vector with the dimension of the attribute
        zero_vector = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
        # We assume here that there are at least 2 elements in the refset
        nb_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
        estimator = self.cluster_class(n_clusters=nb_clusters)
        estimator.fit([record[self.ref_attr_index] or zero_vector for record in refset])
        self.kmeans = estimator
        # Predict on targetset
        self.predicted = self.kmeans.predict([record[self.target_attr_index]
                                              or zero_vector for record in targetset])

    def _iter_blocks(self):
        """ Yield one (reference_block, target_block) per cluster that
        contains records from both datasets, as (position, id) pairs.
        """
        clusters = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
        for position, label in enumerate(self.predicted):
            clusters[label][1].append(self.targetids[position])
        for position, label in enumerate(self.kmeans.labels_):
            clusters[label][0].append(self.refids[position])
        for ref_block, target_block in clusters:
            if ref_block and target_block:
                yield ref_block, target_block

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.kmeans = None
        self.predicted = None
5967 +
5968 +
5969 +###############################################################################
5970 +### KDTREE BLOCKINGS ##########################################################
5971 +###############################################################################
class KdTreeBlocking(BaseBlocking):
    """ A blocking technique based on KdTree.

    Both datasets are indexed in a KDTree built on the attribute of
    interest; each reference record is blocked with the target records
    lying within `threshold` of it.
    """
    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
        """ threshold: maximum distance between two records of a block """
        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.threshold = threshold
        self.reftree = None
        self.targettree = None
        self.nb_elements = None

    def _fit(self, refset, targetset):
        """ Fit the blocking: build one KDTree per dataset.

        Missing (None) attributes are replaced by the 0-vector
        identity element.
        """
        firstelement = refset[0][self.ref_attr_index]
        self.nb_elements = len(refset)
        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
        idelement = (0,) * idsize
        # KDTree is expecting a two-dimensional array
        if idsize == 1:
            # Scalar attributes are wrapped in a 1-tuple. The None test must
            # be done on the raw value *before* wrapping: a non-empty tuple
            # is always truthy, so `(value,) or idelement` would never fall
            # back to the identity element and None would leak into the tree.
            self.reftree = KDTree([(elt[self.ref_attr_index],)
                                   if elt[self.ref_attr_index] is not None
                                   else idelement for elt in refset])
            self.targettree = KDTree([(elt[self.target_attr_index],)
                                      if elt[self.target_attr_index] is not None
                                      else idelement for elt in targetset])
        else:
            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the (position, id) pairs of the records
                          in the corresponding dataset.
        """
        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
        neighbours = []
        for ind in xrange(self.nb_elements):
            if not extraneighbours[ind]:
                continue
            _ref = [self.refids[ind],]
            _target = [self.targetids[v] for v in extraneighbours[ind]]
            neighbours.append((_ref, _target))
        for block1, block2 in neighbours:
            if len(block1) and len(block2):
                yield block1, block2

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.reftree = None
        self.targettree = None
        self.nb_elements = None
6025 +
6026 +
6027 +###############################################################################
6028 +### MINHASHING BLOCKINGS ######################################################
6029 +###############################################################################
class MinHashingBlocking(BaseBlocking):
    """ A blocking technique based on MinHashing: records whose attribute
    values are estimated similar enough by the minhasher end up in the
    same block.
    """
    def __init__(self, ref_attr_index, target_attr_index,
                 threshold=0.1, kwordsgram=1, siglen=200):
        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
        # similarity threshold for the minhasher prediction
        self.threshold = threshold
        # size of the word n-grams used for the signature
        self.kwordsgram = kwordsgram
        # signature length
        self.siglen = siglen
        self.minhasher = Minlsh()
        self.nb_elements = None

    def _fit(self, refset, targetset):
        """ Train the minhasher on the concatenation of both datasets.
        Missing (falsy) attributes are replaced by the empty string.
        """
        missing = ''
        corpus = [record[self.ref_attr_index] or missing for record in refset]
        corpus += [record[self.target_attr_index] or missing for record in targetset]
        self.minhasher.train(corpus, self.kwordsgram, self.siglen)
        self.nb_elements = len(refset)

    def _iter_blocks(self):
        """ Yield the (reference_block, target_block) pairs predicted by
        the minhasher, as (position, id) pairs. Predicted indexes below
        the refset size belong to the reference set, the others to the
        target set; clusters covering only one of the two sets are
        dropped.
        """
        offset = self.nb_elements
        for cluster in self.minhasher.predict(self.threshold):
            ref_block, target_block = [], []
            for i in cluster:
                if i >= offset:
                    target_block.append(self.targetids[i - offset])
                else:
                    ref_block.append(self.refids[i])
            if ref_block and target_block:
                yield ref_block, target_block

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.minhasher = Minlsh()
        self.nb_elements = None
6082 +
6083 +
6084 +###############################################################################
6085 +### BLOCKING PIPELINE #########################################################
6086 +###############################################################################
class PipelineBlocking(BaseBlocking):
    """ Pipeline multiple blocking techniques: each blocking is applied
    inside the blocks produced by the previous one, and the final
    blocking's blocks are stored.
    """

    def __init__(self, blockings, collect_stats=False):
        """ Build the blocking object

        Parameters
        ----------

        blockings: ordered list of blocking objects

        collect_stats: if True, record in the `stats` attribute, for each
                       pipeline level, the sizes of the blocks produced.
        """
        # Initialize the base attributes (is_fitted, refids, ...);
        # the pipeline itself has no attribute of interest, hence None.
        super(PipelineBlocking, self).__init__(None, None)
        self.blockings = blockings
        self.stored_blocks = []
        self.collect_stats = collect_stats
        self.stats = {}

    def _fit(self, refset, targetset):
        """ Internal fit of the pipeline """
        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)

    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
        """ Recursive fit of the blockings.
        Blocks are stored in the stored_blocks attribute.
        """
        if ind < len(self.blockings) - 1:
            # There are other blockings after this one
            blocking = self.blockings[ind]
            blocking.cleanup()
            blocking.fit([refset[i] for i in ref_index],
                         [targetset[i] for i in target_index])
            for block1, block2 in blocking.iter_indice_blocks():
                # translate sub-block indexes back into dataset indexes
                ind_block1 = [ref_index[i] for i in block1]
                ind_block2 = [target_index[i] for i in block2]
                if self.collect_stats:
                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
        else:
            # This is the final blocking
            blocking = self.blockings[ind]
            blocking.cleanup()
            blocking.fit([refset[i] for i in ref_index],
                         [targetset[i] for i in target_index])
            for block1, block2 in blocking.iter_blocks():
                ind_block1 = [(ref_index[i], _id) for i, _id in block1]
                ind_block2 = [(target_index[i], _id) for i, _id in block2]
                if self.collect_stats:
                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
                self.stored_blocks.append((ind_block1, ind_block2))

    def _iter_blocks(self):
        """ Internal iteration function over blocks
        """
        for block1, block2 in self.stored_blocks:
            if block1 and block2:
                yield block1, block2

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline).

        Without this override, cleanup() inherited from BaseBlocking
        would raise NotImplementedError, and the stored blocks would
        accumulate across successive fits.
        """
        self.stored_blocks = []
        self.stats = {}
diff --git a/test/test_alignment.py b/test/test_alignment.py
@@ -20,12 +20,12 @@
6143  import random
6144  random.seed(6) ### Make sure tests are repeatable
6145  from os import path
6146 
6147  from nazca.utils.normalize import simplify
6148 -import nazca.record_linkage.aligner as alig
6149 -import nazca.record_linkage.blocking as blo
6150 +import nazca.rl.aligner as alig
6151 +import nazca.rl.blocking as blo
6152  from nazca.utils.distances import LevenshteinProcessing, GeographicalProcessing
6153 
6154 
6155  TESTDIR = path.dirname(__file__)
6156 
diff --git a/test/test_blocking.py b/test/test_blocking.py
@@ -21,15 +21,15 @@
6157  import random
6158  random.seed(6) ### Make sure tests are repeatable / Minhashing
6159 
6160  from nazca.utils.distances import (levenshtein, soundex, soundexcode,   \
6161                                         jaccard, euclidean, geographical)
6162 -from nazca.record_linkage.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
6163 -                                           MergeBlocking,
6164 -                                           NGramBlocking, PipelineBlocking,
6165 -                                           SoundexBlocking, KmeansBlocking,
6166 -                                           MinHashingBlocking, KdTreeBlocking)
6167 +from nazca.rl.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
6168 +                               MergeBlocking,
6169 +                               NGramBlocking, PipelineBlocking,
6170 +                               SoundexBlocking, KmeansBlocking,
6171 +                               MinHashingBlocking, KdTreeBlocking)
6172  from nazca.utils.normalize import SimplifyNormalizer, loadlemmas
6173 
6174 
6175  TESTDIR = path.dirname(__file__)
6176 
diff --git a/test/test_dataio.py b/test/test_dataio.py
@@ -23,12 +23,12 @@
6177  from tempfile import mkdtemp
6178 
6179  from nazca.utils.dataio import (HTMLPrettyPrint, ValidXHTMLPrettyPrint,
6180                                  sparqlquery, rqlquery, parsefile,
6181                                  autocast, split_file)
6182 -from nazca.named_entities import NerProcess
6183 -from nazca.named_entities.sources import NerSourceLexicon
6184 +from nazca.ner import NerProcess
6185 +from nazca.ner.sources import NerSourceLexicon
6186 
6187  TESTDIR = path.dirname(__file__)
6188 
6189  @contextmanager
6190  def tempdir():
diff --git a/test/test_filters.py b/test/test_filters.py
@@ -15,12 +15,15 @@
6191  #
6192  # You should have received a copy of the GNU Lesser General Public License along
6193  # with this program. If not, see <http://www.gnu.org/licenses/>.
6194  import unittest2
6195 
6196 -from nazca.named_entities import named_entities as core, filters
6197 -from nazca.named_entities.sources import NerSourceLexicon
6198 +from nazca.ner import NerProcess
6199 +from nazca.ner.filters import (NerOccurenceFilter,
6200 +                               NerReplacementRulesFilter,
6201 +                               NerDisambiguationWordParts)
6202 +from nazca.ner.sources import NerSourceLexicon
6203  from nazca.utils.tokenizer import Token, Sentence
6204 
6205 
6206  class FilterTest(unittest2.TestCase):
6207      """ Test of filters """
@@ -29,12 +32,12 @@
6208          """ Test occurence filter """
6209          text = 'Hello everyone, this is   me speaking. And me.'
6210          source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6211                                      'me': 'http://example.com/me'})
6212          source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
6213 -        _filter = filters.NerOccurenceFilter(min_occ=2)
6214 -        ner = core.NerProcess((source1, source2), filters=(_filter,))
6215 +        _filter = NerOccurenceFilter(min_occ=2)
6216 +        ner = NerProcess((source1, source2), filters=(_filter,))
6217          named_entities = ner.process_text(text)
6218          self.assertEqual(named_entities,
6219                           [('http://example.com/me', None,
6220                             Token(word='me', start=26, end=28,
6221                                             sentence=Sentence(indice=0, start=0, end=38))),
@@ -52,12 +55,12 @@
6222          """ Test occurence filter """
6223          text = 'Hello everyone, this is   me speaking. And me.'
6224          source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6225                                      'me': 'http://example.com/me'})
6226          source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
6227 -        _filter = filters.NerOccurenceFilter(max_occ=1)
6228 -        ner = core.NerProcess((source1, source2), filters=(_filter,))
6229 +        _filter = NerOccurenceFilter(max_occ=1)
6230 +        ner = NerProcess((source1, source2), filters=(_filter,))
6231          named_entities = ner.process_text(text)
6232          self.assertEqual(named_entities,
6233                           [('http://example.com/everyone', None,
6234                             Token(word='everyone', start=6, end=14,
6235                                             sentence=Sentence(indice=0, start=0, end=38))),])
@@ -65,12 +68,12 @@
6236      def test_disambiguation_word_length(self):
6237          """ Test occurence filter """
6238          text = 'Hello toto tutu. And toto.'
6239          source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
6240                                     'toto': 'http://example.com/toto'})
6241 -        _filter = filters.NerDisambiguationWordParts()
6242 -        ner = core.NerProcess((source,), filters=(_filter,))
6243 +        _filter = NerDisambiguationWordParts()
6244 +        ner = NerProcess((source,), filters=(_filter,))
6245          named_entities = ner.process_text(text)
6246          self.assertEqual(named_entities,
6247                           [('http://example.com/toto_tutu', None,
6248                             Token(word='toto tutu', start=6, end=15,
6249                                   sentence=Sentence(indice=0, start=0, end=16))),
@@ -82,12 +85,12 @@
6250          """ Test rules filter """
6251          text = 'Hello toto tutu. And toto.'
6252          source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
6253                                     'toto': 'http://example.com/toto'})
6254          rules = {'http://example.com/toto': 'http://example.com/tata'}
6255 -        _filter = filters.NerReplacementRulesFilter(rules)
6256 -        ner = core.NerProcess((source,), filters=(_filter,))
6257 +        _filter = NerReplacementRulesFilter(rules)
6258 +        ner = NerProcess((source,), filters=(_filter,))
6259          named_entities = ner.process_text(text)
6260          self.assertEqual(named_entities,
6261                           [('http://example.com/toto_tutu', None,
6262                             Token(word='toto tutu', start=6, end=15,
6263                                   sentence=Sentence(indice=0, start=0, end=16))),
diff --git a/test/test_named_entities.py b/test/test_named_entities.py
@@ -1,230 +0,0 @@
6264 -# -*- coding:utf-8 -*-
6265 -#
6266 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
6267 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
6268 -#
6269 -# This program is free software: you can redistribute it and/or modify it under
6270 -# the terms of the GNU Lesser General Public License as published by the Free
6271 -# Software Foundation, either version 2.1 of the License, or (at your option)
6272 -# any later version.
6273 -#
6274 -# This program is distributed in the hope that it will be useful, but WITHOUT
6275 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
6276 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
6277 -# details.
6278 -#
6279 -# You should have received a copy of the GNU Lesser General Public License along
6280 -# with this program. If not, see <http://www.gnu.org/licenses/>.
6281 -import unittest2
6282 -
6283 -from nazca.named_entities.sources import (NerSourceLexicon,
6284 -                                          NerSourceSparql,
6285 -                                          NerSourceRql)
6286 -from nazca.named_entities import NerProcess
6287 -from nazca.utils.tokenizer import Token, Sentence
6288 -from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
6289 -
6290 -
6291 -class NerTest(unittest2.TestCase):
6292 -    """ Test of Ner """
6293 -
6294 -    def test_lexicon_source(self):
6295 -        """ Test lexicon source """
6296 -        lexicon = {'everyone': 'http://example.com/everyone',
6297 -                   'me': 'http://example.com/me'}
6298 -        source = NerSourceLexicon(lexicon)
6299 -        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
6300 -        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
6301 -        self.assertEqual(source.query_word('me everyone'), [])
6302 -        self.assertEqual(source.query_word('toto'), [])
6303 -        # Token
6304 -        token = Token('me', 0, 2, None)
6305 -        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
6306 -        token = Token('ma', 0, 2, None)
6307 -        self.assertEqual(source.recognize_token(token), [])
6308 -
6309 -    def test_rql_source(self):
6310 -        """ Test rql source """
6311 -        source = NerSourceRql('http://www.cubicweb.org',
6312 -                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')
6313 -        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
6314 -
6315 -    def test_sparql_source(self):
6316 -        """ Test sparql source """
6317 -        source = NerSourceSparql(u'http://dbpedia.org/sparql',
6318 -                                 u'''SELECT DISTINCT ?uri
6319 -                                     WHERE{
6320 -                                     ?uri rdfs:label "%(word)s"@en .
6321 -                                     ?uri rdf:type ?type}''')
6322 -        self.assertEqual(source.query_word('Python'),
6323 -                         [u'http://dbpedia.org/resource/Python',
6324 -                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
6325 -                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
6326 -
6327 -    def test_ner_process(self):
6328 -        """ Test ner process """
6329 -        text = 'Hello everyone, this is   me speaking. And me.'
6330 -        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6331 -                                   'me': 'http://example.com/me'})
6332 -        ner = NerProcess((source,))
6333 -        named_entities = ner.process_text(text)
6334 -        self.assertEqual(named_entities,
6335 -                         [('http://example.com/everyone', None,
6336 -                           Token(word='everyone', start=6, end=14,
6337 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6338 -                          ('http://example.com/me', None,
6339 -                           Token(word='me', start=26, end=28,
6340 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6341 -                          ('http://example.com/me', None,
6342 -                           Token(word='me', start=43, end=45,
6343 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
6344 -
6345 -    def test_ner_process_multisources(self):
6346 -        """ Test ner process """
6347 -        text = 'Hello everyone, this is   me speaking. And me.'
6348 -        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6349 -                                    'me': 'http://example.com/me'})
6350 -        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
6351 -        # Two sources, not unique
6352 -        ner = NerProcess((source1, source2))
6353 -        named_entities = ner.process_text(text)
6354 -        self.assertEqual(named_entities,
6355 -                         [('http://example.com/everyone', None,
6356 -                           Token(word='everyone', start=6, end=14,
6357 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6358 -                          ('http://example.com/me', None,
6359 -                           Token(word='me', start=26, end=28,
6360 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6361 -                          ('http://example2.com/me', None,
6362 -                           Token(word='me', start=26, end=28,
6363 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6364 -                          ('http://example.com/me', None,
6365 -                           Token(word='me', start=43, end=45,
6366 -                                           sentence=Sentence(indice=1, start=38, end=46))),
6367 -                          ('http://example2.com/me', None,
6368 -                           Token(word='me', start=43, end=45,
6369 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
6370 -        # Two sources, unique
6371 -        ner = NerProcess((source1, source2), unique=True)
6372 -        named_entities = ner.process_text(text)
6373 -        self.assertEqual(named_entities,
6374 -                         [('http://example.com/everyone', None,
6375 -                           Token(word='everyone', start=6, end=14,
6376 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6377 -                          ('http://example.com/me', None,
6378 -                           Token(word='me', start=26, end=28,
6379 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6380 -                          ('http://example.com/me', None,
6381 -                           Token(word='me', start=43, end=45,
6382 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
6383 -        # Two sources inversed, unique
6384 -        ner = NerProcess((source2, source1), unique=True)
6385 -        named_entities = ner.process_text(text)
6386 -        self.assertEqual(named_entities,
6387 -                         [('http://example.com/everyone', None,
6388 -                           Token(word='everyone', start=6, end=14,
6389 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6390 -                          ('http://example2.com/me', None,
6391 -                           Token(word='me', start=26, end=28,
6392 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6393 -                          ('http://example2.com/me', None,
6394 -                           Token(word='me', start=43, end=45,
6395 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
6396 -
6397 -    def test_ner_process_add_sources(self):
6398 -        """ Test ner process """
6399 -        text = 'Hello everyone, this is   me speaking. And me.'
6400 -        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6401 -                                    'me': 'http://example.com/me'})
6402 -        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
6403 -        ner = NerProcess((source1,))
6404 -        named_entities = ner.process_text(text)
6405 -        self.assertEqual(named_entities,
6406 -                         [('http://example.com/everyone', None,
6407 -                           Token(word='everyone', start=6, end=14,
6408 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6409 -                          ('http://example.com/me', None,
6410 -                           Token(word='me', start=26, end=28,
6411 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6412 -                          ('http://example.com/me', None,
6413 -                           Token(word='me', start=43, end=45,
6414 -                                           sentence=Sentence(indice=1, start=38, end=46))),])
6415 -        # Two sources, not unique
6416 -        ner.add_ner_source(source2)
6417 -        named_entities = ner.process_text(text)
6418 -        self.assertEqual(named_entities,
6419 -                         [('http://example.com/everyone', None,
6420 -                           Token(word='everyone', start=6, end=14,
6421 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6422 -                          ('http://example.com/me', None,
6423 -                           Token(word='me', start=26, end=28,
6424 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6425 -                          ('http://example2.com/me', None,
6426 -                           Token(word='me', start=26, end=28,
6427 -                                           sentence=Sentence(indice=0, start=0, end=38))),
6428 -                          ('http://example.com/me', None,
6429 -                           Token(word='me', start=43, end=45,
6430 -                                           sentence=Sentence(indice=1, start=38, end=46))),
6431 -                          ('http://example2.com/me', None,
6432 -                           Token(word='me', start=43, end=45,
6433 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
6434 -
6435 -    def test_ner_process_preprocess(self):
6436 -        """ Test ner process """
6437 -        text = 'Hello Toto, this is   me speaking. And me.'
6438 -        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
6439 -                                   'me': 'http://example.com/me'})
6440 -        preprocessor = NerStopwordsFilterPreprocessor()
6441 -        ner = NerProcess((source,),
6442 -                                  preprocessors=(preprocessor,))
6443 -        named_entities = ner.process_text(text)
6444 -        self.assertEqual(named_entities, [('http://example.com/toto', None,
6445 -                                           Token(word='Toto', start=6, end=10,
6446 -                                                 sentence=Sentence(indice=0, start=0, end=34)))])
6447 -
6448 -    def test_ner_process_add_preprocess(self):
6449 -        """ Test ner process """
6450 -        text = 'Hello Toto, this is   me speaking. And me.'
6451 -        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
6452 -                                   'me': 'http://example.com/me'})
6453 -        preprocessor = NerStopwordsFilterPreprocessor()
6454 -        ner = NerProcess((source,),)
6455 -        named_entities = ner.process_text(text)
6456 -        self.assertEqual(named_entities,
6457 -                         [('http://example.com/toto', None,
6458 -                           Token(word='Toto', start=6, end=10,
6459 -                                 sentence=Sentence(indice=0, start=0, end=34))),
6460 -                          ('http://example.com/me', None,
6461 -                           Token(word='me', start=22, end=24,
6462 -                                 sentence=Sentence(indice=0, start=0, end=34))),
6463 -                          ('http://example.com/me', None,
6464 -                           Token(word='me', start=39, end=41,
6465 -                                 sentence=Sentence(indice=1, start=34, end=42)))])
6466 -        ner.add_preprocessors(preprocessor)
6467 -        named_entities = ner.process_text(text)
6468 -        self.assertEqual(named_entities, [('http://example.com/toto', None,
6469 -                                           Token(word='Toto', start=6, end=10,
6470 -                                                 sentence=Sentence(indice=0, start=0, end=34)))])
6471 -
6472 -    def test_ner_process_chained_word(self):
6473 -        """ Test ner process """
6474 -        text = 'Hello everyone me, this is   me speaking. And me.'
6475 -        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6476 -                                   'everyone me': 'http://example.com/everyone_me',
6477 -                                   'me': 'http://example.com/me'})
6478 -        ner = NerProcess((source,))
6479 -        named_entities = ner.process_text(text)
6480 -        self.assertEqual(named_entities,
6481 -                         [('http://example.com/everyone_me', None,
6482 -                           Token(word='everyone me', start=6, end=17,
6483 -                                 sentence=Sentence(indice=0, start=0, end=41))),
6484 -                          ('http://example.com/me', None,
6485 -                           Token(word='me', start=29, end=31,
6486 -                                 sentence=Sentence(indice=0, start=0, end=41))),
6487 -                          ('http://example.com/me', None,
6488 -                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
6489 -
6490 -
6491 -if __name__ == '__main__':
6492 -    unittest2.main()
6493 -
diff --git a/test/test_ner.py b/test/test_ner.py
@@ -0,0 +1,230 @@
6494 +# -*- coding:utf-8 -*-
6495 +#
6496 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
6497 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
6498 +#
6499 +# This program is free software: you can redistribute it and/or modify it under
6500 +# the terms of the GNU Lesser General Public License as published by the Free
6501 +# Software Foundation, either version 2.1 of the License, or (at your option)
6502 +# any later version.
6503 +#
6504 +# This program is distributed in the hope that it will be useful, but WITHOUT
6505 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
6506 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
6507 +# details.
6508 +#
6509 +# You should have received a copy of the GNU Lesser General Public License along
6510 +# with this program. If not, see <http://www.gnu.org/licenses/>.
6511 +import unittest2
6512 +
6513 +from nazca.ner.sources import (NerSourceLexicon,
6514 +                                          NerSourceSparql,
6515 +                                          NerSourceRql)
6516 +from nazca.ner import NerProcess
6517 +from nazca.utils.tokenizer import Token, Sentence
6518 +from nazca.ner.preprocessors import NerStopwordsFilterPreprocessor
6519 +
6520 +
6521 +class NerTest(unittest2.TestCase):
6522 +    """ Test of Ner """
6523 +
6524 +    def test_lexicon_source(self):
6525 +        """ Test lexicon source """
6526 +        lexicon = {'everyone': 'http://example.com/everyone',
6527 +                   'me': 'http://example.com/me'}
6528 +        source = NerSourceLexicon(lexicon)
6529 +        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
6530 +        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
6531 +        self.assertEqual(source.query_word('me everyone'), [])
6532 +        self.assertEqual(source.query_word('toto'), [])
6533 +        # Token
6534 +        token = Token('me', 0, 2, None)
6535 +        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
6536 +        token = Token('ma', 0, 2, None)
6537 +        self.assertEqual(source.recognize_token(token), [])
6538 +
6539 +    def test_rql_source(self):
6540 +        """ Test rql source """
6541 +        source = NerSourceRql('http://www.cubicweb.org',
6542 +                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')
6543 +        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
6544 +
6545 +    def test_sparql_source(self):
6546 +        """ Test sparql source """
6547 +        source = NerSourceSparql(u'http://dbpedia.org/sparql',
6548 +                                 u'''SELECT DISTINCT ?uri
6549 +                                     WHERE{
6550 +                                     ?uri rdfs:label "%(word)s"@en .
6551 +                                     ?uri rdf:type ?type}''')
6552 +        self.assertEqual(source.query_word('Python'),
6553 +                         [u'http://dbpedia.org/resource/Python',
6554 +                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
6555 +                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
6556 +
6557 +    def test_ner_process(self):
6558 +        """ Test ner process """
6559 +        text = 'Hello everyone, this is   me speaking. And me.'
6560 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6561 +                                   'me': 'http://example.com/me'})
6562 +        ner = NerProcess((source,))
6563 +        named_entities = ner.process_text(text)
6564 +        self.assertEqual(named_entities,
6565 +                         [('http://example.com/everyone', None,
6566 +                           Token(word='everyone', start=6, end=14,
6567 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6568 +                          ('http://example.com/me', None,
6569 +                           Token(word='me', start=26, end=28,
6570 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6571 +                          ('http://example.com/me', None,
6572 +                           Token(word='me', start=43, end=45,
6573 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
6574 +
6575 +    def test_ner_process_multisources(self):
6576 +        """ Test ner process """
6577 +        text = 'Hello everyone, this is   me speaking. And me.'
6578 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6579 +                                    'me': 'http://example.com/me'})
6580 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
6581 +        # Two sources, not unique
6582 +        ner = NerProcess((source1, source2))
6583 +        named_entities = ner.process_text(text)
6584 +        self.assertEqual(named_entities,
6585 +                         [('http://example.com/everyone', None,
6586 +                           Token(word='everyone', start=6, end=14,
6587 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6588 +                          ('http://example.com/me', None,
6589 +                           Token(word='me', start=26, end=28,
6590 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6591 +                          ('http://example2.com/me', None,
6592 +                           Token(word='me', start=26, end=28,
6593 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6594 +                          ('http://example.com/me', None,
6595 +                           Token(word='me', start=43, end=45,
6596 +                                           sentence=Sentence(indice=1, start=38, end=46))),
6597 +                          ('http://example2.com/me', None,
6598 +                           Token(word='me', start=43, end=45,
6599 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
6600 +        # Two sources, unique
6601 +        ner = NerProcess((source1, source2), unique=True)
6602 +        named_entities = ner.process_text(text)
6603 +        self.assertEqual(named_entities,
6604 +                         [('http://example.com/everyone', None,
6605 +                           Token(word='everyone', start=6, end=14,
6606 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6607 +                          ('http://example.com/me', None,
6608 +                           Token(word='me', start=26, end=28,
6609 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6610 +                          ('http://example.com/me', None,
6611 +                           Token(word='me', start=43, end=45,
6612 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
6613 +        # Two sources inversed, unique
6614 +        ner = NerProcess((source2, source1), unique=True)
6615 +        named_entities = ner.process_text(text)
6616 +        self.assertEqual(named_entities,
6617 +                         [('http://example.com/everyone', None,
6618 +                           Token(word='everyone', start=6, end=14,
6619 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6620 +                          ('http://example2.com/me', None,
6621 +                           Token(word='me', start=26, end=28,
6622 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6623 +                          ('http://example2.com/me', None,
6624 +                           Token(word='me', start=43, end=45,
6625 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
6626 +
6627 +    def test_ner_process_add_sources(self):
6628 +        """ Test ner process """
6629 +        text = 'Hello everyone, this is   me speaking. And me.'
6630 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6631 +                                    'me': 'http://example.com/me'})
6632 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
6633 +        ner = NerProcess((source1,))
6634 +        named_entities = ner.process_text(text)
6635 +        self.assertEqual(named_entities,
6636 +                         [('http://example.com/everyone', None,
6637 +                           Token(word='everyone', start=6, end=14,
6638 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6639 +                          ('http://example.com/me', None,
6640 +                           Token(word='me', start=26, end=28,
6641 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6642 +                          ('http://example.com/me', None,
6643 +                           Token(word='me', start=43, end=45,
6644 +                                           sentence=Sentence(indice=1, start=38, end=46))),])
6645 +        # Two sources, not unique
6646 +        ner.add_ner_source(source2)
6647 +        named_entities = ner.process_text(text)
6648 +        self.assertEqual(named_entities,
6649 +                         [('http://example.com/everyone', None,
6650 +                           Token(word='everyone', start=6, end=14,
6651 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6652 +                          ('http://example.com/me', None,
6653 +                           Token(word='me', start=26, end=28,
6654 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6655 +                          ('http://example2.com/me', None,
6656 +                           Token(word='me', start=26, end=28,
6657 +                                           sentence=Sentence(indice=0, start=0, end=38))),
6658 +                          ('http://example.com/me', None,
6659 +                           Token(word='me', start=43, end=45,
6660 +                                           sentence=Sentence(indice=1, start=38, end=46))),
6661 +                          ('http://example2.com/me', None,
6662 +                           Token(word='me', start=43, end=45,
6663 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
6664 +
6665 +    def test_ner_process_preprocess(self):
6666 +        """ Test ner process """
6667 +        text = 'Hello Toto, this is   me speaking. And me.'
6668 +        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
6669 +                                   'me': 'http://example.com/me'})
6670 +        preprocessor = NerStopwordsFilterPreprocessor()
6671 +        ner = NerProcess((source,),
6672 +                                  preprocessors=(preprocessor,))
6673 +        named_entities = ner.process_text(text)
6674 +        self.assertEqual(named_entities, [('http://example.com/toto', None,
6675 +                                           Token(word='Toto', start=6, end=10,
6676 +                                                 sentence=Sentence(indice=0, start=0, end=34)))])
6677 +
6678 +    def test_ner_process_add_preprocess(self):
6679 +        """ Test ner process """
6680 +        text = 'Hello Toto, this is   me speaking. And me.'
6681 +        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
6682 +                                   'me': 'http://example.com/me'})
6683 +        preprocessor = NerStopwordsFilterPreprocessor()
6684 +        ner = NerProcess((source,),)
6685 +        named_entities = ner.process_text(text)
6686 +        self.assertEqual(named_entities,
6687 +                         [('http://example.com/toto', None,
6688 +                           Token(word='Toto', start=6, end=10,
6689 +                                 sentence=Sentence(indice=0, start=0, end=34))),
6690 +                          ('http://example.com/me', None,
6691 +                           Token(word='me', start=22, end=24,
6692 +                                 sentence=Sentence(indice=0, start=0, end=34))),
6693 +                          ('http://example.com/me', None,
6694 +                           Token(word='me', start=39, end=41,
6695 +                                 sentence=Sentence(indice=1, start=34, end=42)))])
6696 +        ner.add_preprocessors(preprocessor)
6697 +        named_entities = ner.process_text(text)
6698 +        self.assertEqual(named_entities, [('http://example.com/toto', None,
6699 +                                           Token(word='Toto', start=6, end=10,
6700 +                                                 sentence=Sentence(indice=0, start=0, end=34)))])
6701 +
6702 +    def test_ner_process_chained_word(self):
6703 +        """ Test ner process """
6704 +        text = 'Hello everyone me, this is   me speaking. And me.'
6705 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
6706 +                                   'everyone me': 'http://example.com/everyone_me',
6707 +                                   'me': 'http://example.com/me'})
6708 +        ner = NerProcess((source,))
6709 +        named_entities = ner.process_text(text)
6710 +        self.assertEqual(named_entities,
6711 +                         [('http://example.com/everyone_me', None,
6712 +                           Token(word='everyone me', start=6, end=17,
6713 +                                 sentence=Sentence(indice=0, start=0, end=41))),
6714 +                          ('http://example.com/me', None,
6715 +                           Token(word='me', start=29, end=31,
6716 +                                 sentence=Sentence(indice=0, start=0, end=41))),
6717 +                          ('http://example.com/me', None,
6718 +                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
6719 +
6720 +
6721 +if __name__ == '__main__':
6722 +    unittest2.main()
6723 +
diff --git a/test/test_preprocessors.py b/test/test_preprocessors.py
@@ -16,11 +16,11 @@
6724  # You should have received a copy of the GNU Lesser General Public License along
6725  # with this program. If not, see <http://www.gnu.org/licenses/>.
6726  import unittest2
6727 
6728  from nazca.utils import tokenizer
6729 -from nazca.named_entities import preprocessors
6730 +from nazca.ner import preprocessors
6731 
6732 
6733  class PreprocessorTest(unittest2.TestCase):
6734      """ Test of preprocessors """
6735