+12
-21
lines changed Filter options
+12
-21
lines changed Original file line number Diff line number Diff line change
@@ -318,27 +318,29 @@ def fit_transform(self, raw_documents, y=None):
318
318
# TODO: parallelize the following loop with joblib?
319
319
# (see XXX up ahead)
320
320
for doc in raw_documents:
321
-
term_count_current = Counter(self.analyzer.analyze(doc))
322
-
term_counts += term_count_current
321
+
term_count_current = Counter()
323
322
324
-
if max_df < 1.0:
325
-
document_counts.update(term_count_current)
323
+
for term in self.analyzer.analyze(doc):
324
+
term_count_current[term] += 1
325
+
term_counts[term] += 1
326
+
327
+
if max_df is not None:
328
+
for term in term_count_current:
329
+
document_counts[term] += 1
326
330
327
331
term_counts_per_doc.append(term_count_current)
328
332
329
333
n_doc = len(term_counts_per_doc)
330
334
331
335
# filter out stop words: terms that occur in almost all documents
332
-
if max_df < 1.0:
336
+
if max_df is not None:
333
337
max_document_count = max_df * n_doc
334
338
stop_words = set(t for t, dc in document_counts.iteritems()
335
339
if dc > max_document_count)
336
-
else:
337
-
stop_words = set()
338
340
339
341
# list the terms that should be part of the vocabulary
340
342
if max_features is None:
341
-
terms = set(term_counts) - stop_words
343
+
terms = [t for t in term_counts if t not in stop_words]
342
344
else:
343
345
# extract the most frequent terms for the vocabulary
344
346
terms = set()
Original file line number Diff line number Diff line change
@@ -24,25 +24,14 @@ def product(*args, **kwds):
24
24
try:
25
25
Counter = collections.Counter
26
26
except AttributeError:
27
-
# Partial replacement for Python 2.7 collections.Counter
27
+
# Partial replacement for Python 2.7 Counter
28
28
class Counter(collections.defaultdict):
29
-
def __init__(self, iterable=(), **kwargs):
29
+
def __init__(self, **kwargs):
30
30
super(Counter, self).__init__(int, **kwargs)
31
-
self.update(iterable)
32
-
33
-
def __iadd__(self, other):
34
-
"""self += other; adds counts for elements in other"""
35
-
for x, n in other.iteritems():
36
-
self[x] += n
37
-
return self
38
31
39
32
def most_common(self):
40
33
return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
41
34
42
-
def update(self, iterable):
43
-
for x in iterable:
44
-
self[x] += 1
45
-
46
35
47
36
def _unique(ar, return_index=False, return_inverse=False):
48
37
"""A replacement for the np.unique that appeared in numpy 1.4.
You can’t perform that action at this time.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4