[minhashing] Correctly build k grams in minhashing, closes #221665

author: Vincent Michel <vincent.michel@logilab.fr>
changeset: 9cb4aaf1a111
branch: default
phase: public
hidden: no
parent revision: #3d6d250a4fa5 — Added tag nazca-debian-version-0.4.0 for changeset 7b24f8c32d34
child revision: #42092b0421da — preparing 0.4.1
files modified by this revision:
test/test_minhashing.py
utils/minhashing.py
utils/normalize.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1394447438 0
# Mon Mar 10 10:30:38 2014 +0000
# Node ID 9cb4aaf1a1111b75ad28237b199e6bdc8545aedb
# Parent 3d6d250a4fa5fcbf070457ff2c8496160fd3244b
[minhashing] Correctly build k grams in minhashing, closes #221665

diff --git a/test/test_minhashing.py b/test/test_minhashing.py
@@ -13,25 +13,43 @@
1  # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
2  # details.
3  #
4  # You should have received a copy of the GNU Lesser General Public License along
5  # with this program. If not, see <http://www.gnu.org/licenses/>.
6 -
7 +from functools import partial
8  import unittest2
9  from os import path
10  import random
11  random.seed(6) ### Make sure tests are repeatable
12 
13  from nazca.utils.normalize import simplify
14 -from nazca.utils.minhashing import Minlsh
15 +from nazca.utils.minhashing import Minlsh, count_vectorizer_func
16  from nazca.data import FRENCH_LEMMAS
17 
18  TESTDIR = path.dirname(__file__)
19 
20 
21 
22  class MinLSHTest(unittest2.TestCase):
23 +
24 +    def test_iter_wordgrams(self):
25 +        sentence = 'nom de la rose'
26 +        minlsh = Minlsh()
27 +        results = list(minlsh._iter_wordgrams(sentence, 2))
28 +        truth = ['nom de', 'nom', 'de la', 'de', 'la rose', 'la', 'rose']
29 +        self.assertEqual(len(results), len(truth))
30 +        self.assertEqual(set(results), set(truth))
31 +
32 +    def test_iter_wordgrams_sklearn(self):
33 +        sentences = ('nom de la rose', 'nom de la')
34 +        tokenizer_func = partial(count_vectorizer_func, min_n=1, max_n=2)
35 +        minlsh = Minlsh(tokenizer_func=tokenizer_func)
36 +        rows, shape = list(minlsh._buildmatrixdocument(sentences, 2))
37 +        self.assertEqual(shape, (2, 7))
38 +        self.assertEqual(rows[0], [0, 1, 2, 3, 4, 5, 6])
39 +        self.assertEqual(rows[1], [0, 1, 2, 4, 5])
40 +
41      def test_all(self):
42          sentences = [u"Un nuage flotta dans le grand ciel bleu.",
43                       u"Des grands nuages noirs flottent dans le ciel.",
44                       u"Je n'aime pas ce genre de bandes dessinées tristes.",
45                       u"J'aime les bandes dessinées de genre comiques.",
diff --git a/utils/minhashing.py b/utils/minhashing.py
@@ -21,13 +21,14 @@
46  from collections import defaultdict
47 
48  import numpy as np
49  from scipy.optimize import bisect
50 
51 -from nazca.utils.normalize import iter_wordgrams
52 
53 -
54 +###############################################################################
55 +### UTILITY FUNCTIONS #########################################################
56 +###############################################################################
57  def randomhashfunction(zr):
58      """ Return a random hash function, mapping x in Z to ZR
59          h:x -> ax + b mod R
60 
61      """
@@ -38,36 +39,69 @@
62      def hashfunc(x):
63          return ((a*x + b)%zr)
64 
65      return hashfunc
66 
67 +def count_vectorizer_func(sentences, min_n, max_n):
68 +    """ Perform a tokenization using scikit learn
69 +    """
70 +    from sklearn.feature_extraction.text import CountVectorizer
71 +    count_vec = CountVectorizer(min_n=min_n, max_n=max_n)
72 +    # Transform and convert to lil to get rows
73 +    data = count_vec.fit_transform(sentences).tolil()
74 +    return [list(l) for l in data.rows], data.shape
75 
76 +
77 +###############################################################################
78 +### MINHASHING ################################################################
79 +###############################################################################
80  class Minlsh(object):
81      """ Operate minhashing + locally-sensitive-hashing to find similar sentences
82      """
83 
84 -    def __init__(self, verbose=False):
85 +    def __init__(self, tokenizer_func=None, verbose=False):
86 +        """ Initialize a minhashing/lsh object
87 +
88 +        Parameters:
89 +        ==========
90 +
91 +           * tokenizer_func is a function that take the sentences
92 +             as argument and return the rows of the sparse matrix
93 +             of tokens, and its shape.
94 +
95 +           * verbose is a boolean that trigger the display of
96 +             some informations
97 +        """
98          self._trained = False
99          self.sigmatrix = None
100 +        if tokenizer_func:
101 +            # Use given tokenizer_func
102 +            self._buildmatrixdocument = lambda x, y: tokenizer_func(x)
103          self._verbose = verbose
104 
105      def train(self, sentences, k=2, siglen=200):
106          """ Train the minlsh on the given sentences.
107 
108              - `k` is the length of the k-wordgrams used
109                (the lower k is, the faster is the training)
110              - `siglen` the length of the sentences signature
111 
112          """
113 -
114          rows, shape = self._buildmatrixdocument(sentences, k)
115 -
116 -        if self._verbose: print "Training is done. Wait while signaturing"
117 -
118 +        if self._verbose:
119 +            print "Training is done. Wait while signaturing"
120          self._computesignaturematrix(rows, shape, siglen)
121          self._trained = True
122 
123 +    def _iter_wordgrams(self, sentence, k):
124 +        """ Generator of k-wordgrams on the given sentence
125 +        """
126 +        words = sentence.split(' ')
127 +        for r in xrange(len(words)):
128 +            for width in range(1, k+1):
129 +                if r+width<=len(words):
130 +                    yield ' '.join(words[r:r + width])
131 
132      def _buildmatrixdocument(self, sentences, k):
133          """ Return a sparse matrix where :
134 
135              - Each sentence is a column
@@ -75,15 +109,15 @@
136 
137              Each value (r, c) is set to 1 if the element at row r is in the
138              sentence c, 0 otherwise
139 
140          """
141 -
142 +        # Use default mode
143          rows, universe, sizeofuniverse = [], {}, 0
144          for nb, sent in enumerate(sentences):
145              row = []
146 -            for w in iter_wordgrams(sent, k):
147 +            for w in self._iter_wordgrams(sent, k):
148                  row.append(universe.setdefault(w, sizeofuniverse))
149                  if row[-1] == sizeofuniverse:
150                      sizeofuniverse += 1
151              rows.append(row)
152              if self._verbose and nb % 50000 == 0:
diff --git a/utils/normalize.py b/utils/normalize.py
@@ -138,18 +138,10 @@
153              chunks.append(schunks[-1])
154          else:
155              chunks.append(chunk)
156      return chunks
157 
158 -def iter_wordgrams(sentence, k):
159 -    """ Generator of k-wordgrams on the given sentence
160 -    """
161 -    words = sentence.split(' ')
162 -    #XXX Call tokenizer
163 -    for r in xrange(len(words)):
164 -        yield ' '.join(words[r:r + k])
165 -
166  def lemmatized(sentence, lemmas, tokenizer=None):
167      """ Return the lemmatized sentence
168      """
169      tokenized_sent = tokenize(sentence, tokenizer)
170      tokenized_sentformated = []