A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from https://github.com/scikit-learn/scikit-learn/commit/de6e93094499e4d81b8e3b15fc66b6b9252945af below:

Revert "BUG fixed and cosmetics in CountVectorizer" · scikit-learn/scikit-learn@de6e930 · GitHub

File tree (Expand file tree | Collapse file tree): 2 files changed

+12

-21

lines changed

Filter options

Expand file tree | Collapse file tree: 2 files changed

+12

-21

lines changed

Original file line number | Diff line number | Diff line change

@@ -318,27 +318,29 @@ def fit_transform(self, raw_documents, y=None):

318 318

# TODO: parallelize the following loop with joblib?

319 319

# (see XXX up ahead)

320 320

for doc in raw_documents:

321 -

term_count_current = Counter(self.analyzer.analyze(doc))

322 -

term_counts += term_count_current

321 +

term_count_current = Counter()

323 322 324 -

if max_df < 1.0:

325 -

document_counts.update(term_count_current)

323 +

for term in self.analyzer.analyze(doc):

324 +

term_count_current[term] += 1

325 +

term_counts[term] += 1

326 + 327 +

if max_df is not None:

328 +

for term in term_count_current:

329 +

document_counts[term] += 1

326 330 327 331

term_counts_per_doc.append(term_count_current)

328 332 329 333

n_doc = len(term_counts_per_doc)

330 334 331 335

# filter out stop words: terms that occur in almost all documents

332 -

if max_df < 1.0:

336 +

if max_df is not None:

333 337

max_document_count = max_df * n_doc

334 338

stop_words = set(t for t, dc in document_counts.iteritems()

335 339

if dc > max_document_count)

336 -

else:

337 -

stop_words = set()

338 340 339 341

# list the terms that should be part of the vocabulary

340 342

if max_features is None:

341 -

terms = set(term_counts) - stop_words

343 +

terms = [t for t in term_counts if t not in stop_words]

342 344

else:

343 345

# extract the most frequent terms for the vocabulary

344 346

terms = set()

Original file line number | Diff line number | Diff line change

@@ -24,25 +24,14 @@ def product(*args, **kwds):

24 24

try:

25 25

Counter = collections.Counter

26 26

except AttributeError:

27 -

# Partial replacement for Python 2.7 collections.Counter

27 +

# Partial replacement for Python 2.7 Counter

28 28

class Counter(collections.defaultdict):

29 -

def __init__(self, iterable=(), **kwargs):

29 +

def __init__(self, **kwargs):

30 30

super(Counter, self).__init__(int, **kwargs)

31 -

self.update(iterable)

32 - 33 -

def __iadd__(self, other):

34 -

"""self += other; adds counts for elements in other"""

35 -

for x, n in other.iteritems():

36 -

self[x] += n

37 -

return self

38 31 39 32

def most_common(self):

40 33

return sorted(self.iteritems(), key=itemgetter(1), reverse=True)

41 34 42 -

def update(self, iterable):

43 -

for x in iterable:

44 -

self[x] += 1

45 - 46 35 47 36

def _unique(ar, return_index=False, return_inverse=False):

48 37

"""A replacement for the np.unique that appeared in numpy 1.4.

You can’t perform that action at this time.


RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4