Find top k frequent words with map reduce framework.

The mapper's key is the document id, value is the content of the document, words in a document are split by spaces.

For reducer, the output should be at most k key-value pairs, which are the top k words and their frequencies in this reducer. The judge will take care about how to merge different reducers' results to get the global top k frequent words, so you don't need to care about that part.

The_k_is given in the constructor of TopK class.

Notice

For the words with same frequency, rank them with alphabet.

Have you met this question in a real interview?

Yes

Example

Given document A =

lintcode is the best online judge
I love lintcode

and document B =

lintcode is an online judge for coding interview
you can test your code online at lintcode

The top 2 words and their frequencies should be

lintcode, 4
online, 3
/**
 * Definition of Input:
 * template<class T>
 * class Input {
 * public:
 *     bool done(); 
 *         // Returns true if the iteration has elements or false.
 *     void next();
 *         // Move to the next element in the iteration
 *         // Runtime error if the iteration has no more elements
 *     T value();
 *        // Get the current element, Runtime error if
 *        // the iteration has no more elements
 * }
 * Definition of Document:
 * class Document {
 * public:
 *     int id; // document id
 *     string content; // document content
 * }
 */
class TopKFrequentWordsMapper: public Mapper {
public:
    void Map(Input<Document>* input) {
        // Write your code here
        // Please directly use func 'output' to output 
        // the results into output buffer.
        // void output(string &key, int value);
        while (!input->done()) {
            vector<string> words = split(input->value().content, " ");
            for (string& word : words) {
                if (!word.empty())
                    output(word, 1);
            }
            input->next();
        }
    }

private:
    vector<string> split(const string& value, string delim) {
        vector<string> words;
        int last = 0, index;
        while ((index = value.find(delim, last)) != string::npos) {
            words.push_back(value.substr(last, index - last));
            last = index + delim.length();
        }
        if (last < value.length()) {
            words.push_back(value.substr(last, value.length() - last));
        }
        return words;
    }
};

struct Pair {
    Pair(string key, int times) {
        this->key = key;
        this->times = times;
    }
    string key;
    int    times;

    bool operator< (const Pair& other) const {
        return times > other.times || (times == other.times && key < other.key);
    }
};

class TopKFrequentWordsReducer: public Reducer {

private:
    int k;
    priority_queue<Pair> q;

public:
    void setUp(int k) {
        // initialize your data structure here
        this->k = k;
    }

    void Reduce(string &key, Input<int>* input) {
        // Write your code here  
        int sum = 0;
        while (!input->done()) {
            sum += input->value();
            input->next();
        }

        Pair pair(key, sum);
        if (q.size() < k) {
            q.push(pair);
        } else {
            q.push(pair);
            q.pop();
        }
    }

    void cleanUp() {
        // Please directly use func 'output' to output 
        // the top k pairs <word, times> into output buffer.
        // void output(string &key, int &value);

        vector<Pair> list;
        while (!q.empty()) {
            list.push_back(q.top());
            q.pop();
        }

        for (auto it = list.rbegin(); it != list.rend(); ++it) {
            output(it->key, it->times);
        }

        // for (int i = list.size() -1 ; i >= 0; --i) {
        //     output(list[i].key, list[i].times);
        // }
    }
};

results matching ""

    No results matching ""