(
int i= 0;
i< 256; ++
i) {
107 stringclause_ends(
".?!;:\"{}[]()");
108 ITERATE(
string, it, clause_ends) {
139 boolis_alpha =
false;
160string::size_type
i)
162 for( ;
i< s.size(); ++
i) {
167 return(
i== s.size() ? string::npos :
i);
171string::size_type
i)
173 for( ;
i< s.size(); ++
i) {
178 return(
i== s.size() ? string::npos :
i);
182string::size_type
i)
184 for( ;
i< s.size(); ++
i) {
189 return(
i== s.size() ? string::npos :
i);
201 if(iter->first[0] ==
'p'&& iter->first.find(
"phrase: ") == 0) {
202phrase_out.
insert(phrase_out.
end(), *iter);
218freq.
Add(iter->first, iter->second);
224 const string& prefix,
228 if(iter->first.find_first_of(
":") != string::npos) {
234freq.
Add(prefix +
": "+ iter->first, iter->second);
340 _TRACE(
"CTextUtil::GetWordFrequencies(): text = "<<
text);
341string::size_type clause_start = 0;
342string::size_type clause_end =
text.size();
344list<string> prev_words;
350 while(clause_start != clause_end) {
351clause_end =
text.size();
353 if(pos != string::npos) {
358 _TRACE(
"clause: |"<<
text.substr(clause_start, clause_end - clause_start) <<
"|");
359 for( ; clause_start != clause_end; clause_start = pos) {
363 if(clause_start == clause_end) {
368pos =
min(clause_end,
376word.assign(
text, clause_start, pos - clause_start);
379string::size_type pos1 =
380word.find_first_not_of(
"0123456789");
381 if(pos1 == string::npos) {
390string::iterator copy_to = word.begin();
392 if(*copy_from ==
'\'') {
397 if(copy_to != word.end()) {
398word.erase(copy_to, word.end());
409 _TRACE(
" word: "<< word);
413 typedefpair<string, string> TDiphPair;
414 static constTDiphPair sc_DiphPairs[] = {
415TDiphPair(
"oe",
"e"),
416TDiphPair(
"ae",
"e")
420 for(
size_t i= 0;
i<
sizeof(sc_DiphPairs) /
sizeof(TDiphPair); ++
i) {
421 if(word.find(sc_DiphPairs[
i].first) != string::npos) {
423sc_DiphPairs[
i].
first,
424sc_DiphPairs[
i].second,
441prev_words.push_back(word);
444prev_words.push_back(stem);
449 while(prev_words.size() > 3) {
450prev_words.pop_front();
452 if(prev_words.size() > 1) {
453list<string>::iterator pit = prev_words.begin();
454list<string>::iterator end = prev_words.end();
456 for( ; pit != end; ++pit) {
465phrase =
"phrase: ";
476 _TRACE(
" phrase: |"<< phrase <<
"|");
478freq.
Add(phrase, 1);
495 if(clause_start == string::npos) {
502 _TRACE(
" word: "<< it->first <<
" count: "<< it->second);
514 if(iter->first.find_first_of(
":") != string::npos) {
521 if(it != stem_freq.
end()) {
522it->second += iter->second;
655 return(iter != sc_StopWords.end());
668 for( ; stop_it != stop_end && it != end; ) {
669 if(it->first == *stop_it) {
673 if(it->first < *stop_it) {
685string::size_type pos = 0;
686 while( (pos = title.find_first_of(
".,[](){};:'\"/?<>", pos)) != string::npos) {
695vector<unsigned char>&
data)
716 constvector<unsigned char>&
data)
723 constvector<char>&
data)
737 const void*
data,
size_tdata_len)
static void Stem(const string &in_str, string *out_str)
Compute the Porter stem for a given word.
iterator find(const Key &key)
pair< iterator, bool > insert(const value_type &val)
void Add(Key idx, Score weight=Score(1))
TVector::value_type value_type
TVector::iterator iterator
Reallocable memory buffer (no memory copy overhead) Mimics vector<>, without the overhead of explicit...
TBase::const_iterator const_iterator
static void GetStemFrequencies(const TWordFreq &freq, TWordFreq &stems, TFlags flags=fDefaults)
retrieve stem frequencies from a set of word frequencies
static bool IsStopWord(const string &str)
return true if the provided word is a stop word
static void TrimStopWords(TWordFreq &freq)
eliminate the stop words from a set of word frequencies
static void CleanJournalTitle(string &title)
perform a set of punctuational clean-ups on a string suitable for a journal or book title
static void SplitWordFrequencies(const TWordFreq &wf_in, TWordFreq &wf_out, TWordFreq &phrase_out)
split a set of word frequencies into phrase and non-phrase frequencies; this is done to treat the two ...
static void EncodeFreqs(const TWordFreq &freq, vector< char > &data)
static void AddWordFrequencies(TWordFreq &freq, const TWordFreq &wf, TFlags flags=0)
add a set of frequencies into another set
static void GetWordFrequencies(const string &text, TWordFreq &freq, TFlags flags=fDefaults)
retrieve word frequencies for a given piece of text
static void DecodeFreqs(TWordFreq &freq, const vector< char > &data)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
double wf(double lambda, double D_LR, double D_LU, double D_LD, double D_RU, double D_RD, double D_DU)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
uint16_t Uint2
2-byte (16-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static string Int8ToString(Int8 value, TNumToStringFlags flags=0, int base=10)
Convert Int8 to string.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static string & ToLower(string &str)
Convert string to lower case -- string& version.
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
unsigned int
A callback function used to compare two keys in a database.
static void text(MDB_val *v)
CStaticArraySet< const char *, PCase_CStr > TStopWords
static string::size_type s_NextClauseStop(const string &s, string::size_type i)
static bool s_IsNumeric(unsigned char c)
static SLoadTokens s_ForceTokenLoad
void s_NumericToFreq(const T &val, CTextUtil::TWordFreq &freq)
static const char *const sc_StopWordArray[]
Stop Word Pruning.
static bool s_IsAlphaNumeric(unsigned char c)
string s_ValToString(Int4 i)
static string::size_type s_NextTokenStart(const string &s, string::size_type i)
static char s_ToLower(unsigned char c)
DEFINE_STATIC_ARRAY_MAP(TStopWords, sc_StopWords, sc_StopWordArray)
static string::size_type s_NextTokenStop(const string &s, string::size_type i)
static Uint2 sc_Tokens[256]
void Encode(const CRawScoreVector< Key, Score > &, vector< char > &)
void Decode(const vector< char > &, CRawScoreVector< Key, Score > &)
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4