小規模データで単語の数を数えてみた (1)

大規模データで単語の数を数える - ny23の日記で書いた Count-Min Sketch で，誤差を減らすヒューリスティクス (conservative update)

New directions in traffic measurement and accounting (SIGCOMM Comput. Commun. Rev., 32(4), 2002)

を実装して，動的ダブル配列を使って Wikipedia のテキスト処理を高速化 - ny23の日記の小規模データ（1.5GiB の Wikipedia 本文）の単語カウントでその効果を見てみた．考えるところはハッシュ関数に何を使うかぐらいで（キーを陽に保持しない限りは）実装はとても簡単．

// GNU GPL version 2 copyright@ny23
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <new>

typedef unsigned int   uint;
typedef unsigned char  uchar;

class general_hash { // pairwise-indendendent hash family
public:
  general_hash (const uint mask) : _h1 (), _mask (mask) {
    for (uint i = 0; i < 256; ++i)
      _h1[i] = _rng (_mask);
  }
  uint operator () (const char * p) const {
    uint h = 0;
    while (const uchar c = static_cast <uchar> (*p)) {
      ++p;
      if ((h <<= 1) >> 31) h ^= _irreducible_poly31 ^ _h1[c]; else h ^= _h1[c];
    }
    return h & _mask;
  }
private:
  static struct { // Xorshift RNG; http://www.jstatsoft.org/v08/i14/paper
    size_t gen () {
      static size_t x (123456789), y (362436069), z (521288629), w (88675123);
      size_t t = (x ^ (x << 11)); x = y; y = z; z = w;
      return w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
    }
    size_t operator () (size_t mask) { return gen () & mask; } // % max
  } _rng;
  static const uint _irreducible_poly31 = (1 << 31) + (1 << 3) + 1;
  uint _h1[256];
  uint _mask;
};

template <typename T>
class cmin_base {
public:
  void update (const char * key) {
    ++_n;
    static_cast <T*> (this)->update_impl (key);
  }
  size_t draw_count (const char * const key) const {
    size_t ret = _sketch[_hash_fun[0] (key)];
    for (size_t * s (&_sketch[_w]), i (1); i < _d; ++i, s += _w) {
      const size_t &cnt = s[_hash_fun[i] (key)];
      if (cnt < ret) ret = cnt;
    }
    return ret;
  }
private:
  // sketch parameters
  const double         _epsilon;  // \epsilon-approximation
  const double         _delta;    // error probability
  size_t               _n;        // # tokens
  // uncopyable
  cmin_base (const cmin_base&);
  cmin_base& operator= (const cmin_base&);
protected:
  const uint           _d;        // # counters (# hash functions)
  const uint           _w;        // counter size = 2^n to enable quick modulo
  general_hash * const _hash_fun;
  size_t * const       _sketch;
  //
  cmin_base (double epsilon, double delta) :
    _epsilon (epsilon), _delta (delta), _n (0), _d (std::ceil (std::log (1.0 / _delta))), _w (1 << static_cast <uint> (std::ceil (std::log (std::exp (1.0) / _epsilon) / std::log (2)))), _hash_fun (static_cast <general_hash *>(::operator new (_d * sizeof (general_hash)))), _sketch (new size_t[_d * _w]) {
    std::fprintf (stderr, "%f is adjusted to %f to enable quick modulo.\n",
                  _delta, std::exp (1.0) / static_cast <double> (_w));
    for (uint i = 0; i < _d; ++i)
      new (&_hash_fun[i]) general_hash (_w - 1);
    std::memset (_sketch, 0, _d * _w * sizeof (size_t));
  }
  ~cmin_base () {
    std::fprintf (stderr, "d(%d) * w(%d) = %d cells for %ld tokens.\n",
                  _d, _w, _d * _w, _n);
    ::operator delete (_hash_fun); // ignoring destructor
    delete [] _sketch;
  }
};

// count min sketch; http://sites.google.com/site/countminsketch/
class cmin : public cmin_base <cmin> {
public:
  cmin (double epsilon, double delta) : cmin_base <cmin> (epsilon, delta) {}
  void update_impl (const char * key) {
    for (size_t *s (&_sketch[0]), i (0); i < _d; ++i, s += _w)
      ++s[_hash_fun[i] (key)];
  }
};

// count min sketch with conservative update
class cmin_cu : public cmin_base <cmin_cu> {
public:
  cmin_cu (double epsilon, double delta) : cmin_base <cmin_cu> (epsilon, delta), _slot (new size_t*[_d] ()) {}
  ~cmin_cu () { delete [] _slot; }
  void update_impl (const char * key) {
    size_t * min = &_sketch[_hash_fun[0] (key)];
    size_t nslot = 0;
    _slot[nslot] = min; ++nslot;
    for (size_t * s (&_sketch[_w]), i (1); i < _d; ++i, s += _w) {
      size_t * const slot = &s[_hash_fun[i] (key)];
      if (*slot > *min) continue; // skip
      if (*slot < *min) { min = slot; nslot = 0; } // flush
      _slot[nslot] = slot; ++nslot;
    }
    for (size_t i = 0; i < nslot; ++i) ++(*_slot[i]);
  }
private:
  size_t ** _slot;  // min slots for conservative update
};

int main (int argc, char** argv) {

  if (argc < 2) {
    std::fprintf (stderr, "Usage: %s file epsilon delta [ref_cnt]\n", argv[0]);
    std::exit (1);
  }
 
  FILE * fp = std::fopen (argv[1], "r");
  if (! fp)
    { std::fprintf (stderr, "no such file: %s\n", argv[1]); std::exit (1); }
  char buf[1 << 18]; std::setvbuf (fp, &buf[0], _IOFBF, 1 << 18);

  // cmin cm (std::strtod (argv[2], NULL), std::strtod (argv[3], NULL)); // CM
  cmin_cu cm (std::strtod (argv[2], NULL), std::strtod (argv[3], NULL)); // CM+cu

  // monitor items
  char * line = 0;
#ifdef __APPLE__
  size_t read = 0;
  while ((line = fgetln (fp, &read)) != NULL) {
#else
  ssize_t read = 0;
  size_t  size = 0;
  while ((read = getline (&line, &size, fp)) != -1) {
#endif
    * (line + read - 1) = '\0';
    cm.update (line);
  }
  std::fclose (fp);
  
  if (argc == 4) return 0;
  
  if (! (fp = std::fopen (argv[4], "r")))
    { std::fprintf (stderr, "no such file: %s\n", argv[4]); std::exit (1); }
#ifdef __APPLE__
  while ((line = fgetln (fp, &read)) != NULL) {
#else
  while ((read = getline (&line, &size, fp)) != -1) {
#endif
    char * key   = std::strtok (line, " \n");
    size_t count = std::strtol (std::strtok (NULL, " \n"), NULL, 10);
    
    // draw an overestimated count from sketch
    size_t ret = cm.draw_count (key);
    std::fprintf (stdout, "%s %ld %ld\n", key, ret, count);
  }
  std::fclose (fp);
  
  return 0;
}

Count-Min Sketch で単語頻度を計測後，exact な頻度を参照して比較するコード．sketch として利用する場合は，ハッシュ関数とカウンタを保存すれば ok．
というわけで，δ (error probability) を 0.001 (カウンタの数 d=7) に固定し，のべ item 数 N (=279,024,968; item の異なり数は 1,594,517) に対する絶対誤差 ε を 0.01 -> 0.0001 と変えて（実際はプログラムの都合上高コストな除算を避けるため，カウンタのサイズ w が 2^n になるように ε を調整），exact な頻度と比較してみた．

> run ./hash_counter4*1 unigram_raw.txt | LC_ALL=C sort -nr -k 2 > exact.list
> run ./count_min unigram_raw.txt 0.1 0.001 [exact.list]

こんな感じで実行．exact.list は，ハッシュベースのカウンタの出力を頻度順に並べたもの（時間・メモリ計測時は略）．

// compiled with gcc 4.6.0 20101113 (-O2 -march=core2 -m64)
CM:    Count-Min Sketch
CM+cu: Count-Min Sketch + conservative update 
exact: trivial hash-based counter
// -> は exact な頻度が得られたことを示す

































































































































































-
















































































            |   CM    |  CM+cu  |   CM    |  CM+cu  |   CM    |  CM+cu  |  exact
ε           |        0.01       |        0.001      |        0.0001     |
ε'(adjusted)|        0.00539    |        0.000664   |        0.000083   |
Nε'         |  1503945          |   185273          |    23159          |
d x w       |       7 x 2^9     |       7 x 2^12    |       7 x 2^15    |

































































































































































-
















































































memory [MiB]|   0.8   |   0.8   |   1.0   |   1.0   |   2.5   |   2.5   | 106.5
time   [s]  |  23.99  |  33.89  |  24.94  |  34.32  |  27.12  |  32.50  |  28.85

































































































































































-
















































































Rank 1-5     
の　　　　　　| 13529343|       ->| 13421521|       ->| 13406781|       ->| 13406476
、　　　　　　| 12577998|       ->| 12474298|       ->| 12466374|       ->| 12466310
。　　　　　　|  9423857|       ->|  9257520|       ->|  9253389|       ->|  9253307
に　　　　　　|  8447663|       ->|  8313843|       ->|  8297252|       ->|  8296578
は　　　　　　|  7999402|       ->|  7898323|       ->|  7892070|       ->|  7891964
Rank 10^2+1-5
において　　　|   319020|       ->|   183919|       ->|   179038|       ->|   178891
名　　　　　　|   361503|   178166|   187748|       ->|   178499|       ->|   178165
上　　　　　　|   424109|       ->|   192295|       ->|   178495|       ->|   178124
町　　　　　　|   306655|   178302|   187704|       ->|   177784|       ->|   177584
性　　　　　　|   380416|       ->|   188870|       ->|   177864|       ->|   177481
Rank 10^4+1-5
武蔵野　　　　|   149276|    83188|    12595|     5132|     2096|       ->|     1818
文字通り　　　|   127537|    83187|     7112|     4911|     1955|       ->|     1818
基部　　　　　|   144208|    83194|     7705|     5025|     2133|       ->|     1818
前部　　　　　|   211623|    83189|    22166|     6778|     1962|       ->|     1818
優等　　　　　|   195920|    83195|    20857|     9562|     2283|       ->|     1818
Rank 10^6+1-5
ドゥカーテ・・|   129037|    83186|     5457|     3922|      213|      162|        1
ドゥカー　　　|   108091|    83185|    12503|     5070|      153|      135|        1
ドゥカル　　　|   215201|    83188|     5629|     4620|      277|      197|        1
ドゥカドス　　|   182301|    83195|    10460|     5401|      252|      158|        1
ドゥカド　　　|   103954|    83166|     9071|     5004|      724|      283|        1

































































































































































-

というわけで，大規模とはとても言えないデータながら， conservative update の効果は確認できた．（同じ ε の）Count-Min Sketch に対して，conservative update を行うと低頻度語について誤差が約半分程度に収まっているのに加え，高頻度語に対して頻度の正確な見積りができるようになっている．また，単語の頻度分布が power law に従うおかげで，Nε’より実際の誤差が抑えられている点も注目に値する．
また，キーのためにメモリを動的に確保しないおかげで，conservative update を行わない場合は，ハッシュベースのカウンタより頻度の計測が高速になっている*2．取り急ぎ適当に書いたコードなので，最適化の余地はまだまだあるが*3，sketch のまま保存して使う限り，使い勝手は非常に良さそうだ (ちなみに未登録語に対する頻度は，上記の表で，一番下のランクのものと同じぐらい)．

*1:ただし，char buf[1 << 18]; std::setvbuf (fp, &buf[0], _IOFBF, 1 << 18); を追加．

*2:error probability δ に対してカウンタの数は ln 1/δ 必要なため，δ を小さくするに従い更新速度は低下する．なお，絶対誤差の割合 ε を減らしても更新するカウンタの数は変わらないので，（キャッシュミスが起きやすくなることを除けば）更新速度に大きな影響はない．

*3:IO がボトルネックなので，行読み込みを read + 自前 buffer にするとか，高頻度の単語について，ハッシュ関数の計算結果をキャッシュする，あるいは（conservative update だと結果に影響する可能性があるが）exact なカウンタを保持して，最後にまとめて更新することにしてハッシュ関数の計算を減らすとか．