+12
-21
lines changed Filter options
+12
-21
lines changed Original file line number Diff line number Diff line change
@@ -318,27 +318,29 @@ def fit_transform(self, raw_documents, y=None):
318
318
# TODO: parallelize the following loop with joblib?
319
319
# (see XXX up ahead)
320
320
for doc in raw_documents:
321
-
term_count_current = Counter(self.analyzer.analyze(doc))
322
-
term_counts += term_count_current
321
+
term_count_current = Counter()
323
322
324
-
if max_df < 1.0:
325
-
document_counts.update(term_count_current)
323
+
for term in self.analyzer.analyze(doc):
324
+
term_count_current[term] += 1
325
+
term_counts[term] += 1
326
+
327
+
if max_df is not None:
328
+
for term in term_count_current:
329
+
document_counts[term] += 1
326
330
327
331
term_counts_per_doc.append(term_count_current)
328
332
329
333
n_doc = len(term_counts_per_doc)
330
334
331
335
# filter out stop words: terms that occur in almost all documents
332
-
if max_df < 1.0:
336
+
if max_df is not None:
333
337
max_document_count = max_df * n_doc
334
338
stop_words = set(t for t, dc in document_counts.iteritems()
335
339
if dc > max_document_count)
336
-
else:
337
-
stop_words = set()
338
340
339
341
# list the terms that should be part of the vocabulary
340
342
if max_features is None:
341
-
terms = set(term_counts) - stop_words
343
+
terms = [t for t in term_counts if t not in stop_words]
342
344
else:
343
345
# extract the most frequent terms for the vocabulary
344
346
terms = set()
Original file line number Diff line number Diff line change
@@ -24,25 +24,14 @@ def product(*args, **kwds):
24
24
try:
25
25
Counter = collections.Counter
26
26
except AttributeError:
27
-
# Partial replacement for Python 2.7 collections.Counter
27
+
# Partial replacement for Python 2.7 Counter
28
28
class Counter(collections.defaultdict):
29
-
def __init__(self, iterable=(), **kwargs):
29
+
def __init__(self, **kwargs):
30
30
super(Counter, self).__init__(int, **kwargs)
31
-
self.update(iterable)
32
-
33
-
def __iadd__(self, other):
34
-
"""self += other; adds counts for elements in other"""
35
-
for x, n in other.iteritems():
36
-
self[x] += n
37
-
return self
38
31
39
32
def most_common(self):
40
33
return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
41
34
42
-
def update(self, iterable):
43
-
for x in iterable:
44
-
self[x] += 1
45
-
46
35
47
36
def _unique(ar, return_index=False, return_inverse=False):
48
37
"""A replacement for the np.unique that appeared in numpy 1.4.
You can’t perform that action at this time.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4