Find top k frequent words with map reduce framework.
The mapper's key is the document id, value is the content of the document, words in a document are split by spaces.
For reducer, the output should be at most k key-value pairs, which are the top k words and their frequencies in this reducer. The judge will take care about how to merge different reducers' results to get the global top k frequent words, so you don't need to care about that part.
The_k_is given in the constructor of TopK class.
Notice
For the words with same frequency, rank them with alphabet.
Have you met this question in a real interview?
Yes
Example
Given document A =
lintcode is the best online judge
I love lintcode
and document B =
lintcode is an online judge for coding interview
you can test your code online at lintcode
The top 2 words and their frequencies should be
lintcode, 4
online, 3
/**
* Definition of Input:
* template<class T>
* class Input {
* public:
* bool done();
* // Returns true if the iteration has elements or false.
* void next();
* // Move to the next element in the iteration
* // Runtime error if the iteration has no more elements
* T value();
* // Get the current element, Runtime error if
* // the iteration has no more elements
* }
* Definition of Document:
* class Document {
* public:
* int id; // document id
* string content; // document content
* }
*/
class TopKFrequentWordsMapper: public Mapper {
public:
void Map(Input<Document>* input) {
// Write your code here
// Please directly use func 'output' to output
// the results into output buffer.
// void output(string &key, int value);
while (!input->done()) {
vector<string> words = split(input->value().content, " ");
for (string& word : words) {
if (!word.empty())
output(word, 1);
}
input->next();
}
}
private:
vector<string> split(const string& value, string delim) {
vector<string> words;
int last = 0, index;
while ((index = value.find(delim, last)) != string::npos) {
words.push_back(value.substr(last, index - last));
last = index + delim.length();
}
if (last < value.length()) {
words.push_back(value.substr(last, value.length() - last));
}
return words;
}
};
struct Pair {
Pair(string key, int times) {
this->key = key;
this->times = times;
}
string key;
int times;
bool operator< (const Pair& other) const {
return times > other.times || (times == other.times && key < other.key);
}
};
class TopKFrequentWordsReducer: public Reducer {
private:
int k;
priority_queue<Pair> q;
public:
void setUp(int k) {
// initialize your data structure here
this->k = k;
}
void Reduce(string &key, Input<int>* input) {
// Write your code here
int sum = 0;
while (!input->done()) {
sum += input->value();
input->next();
}
Pair pair(key, sum);
if (q.size() < k) {
q.push(pair);
} else {
q.push(pair);
q.pop();
}
}
void cleanUp() {
// Please directly use func 'output' to output
// the top k pairs <word, times> into output buffer.
// void output(string &key, int &value);
vector<Pair> list;
while (!q.empty()) {
list.push_back(q.top());
q.pop();
}
for (auto it = list.rbegin(); it != list.rend(); ++it) {
output(it->key, it->times);
}
// for (int i = list.size() -1 ; i >= 0; --i) {
// output(list[i].key, list[i].times);
// }
}
};