HTML Chunker (C Library 版)

興味もあったので、Ruby の拡張ライブラリとして C でパーザを書いてみた。いろいろいい加減。

#include <ruby.h>
#include <stdbool.h>
#include <string.h>

/*
 * parse_html(str) -> array
 *
 * Splits an HTML string into consecutive text and tag chunks.
 *
 * obj: receiver of the call (unused).
 * str: HTML source; coerced to a String with StringValue.
 *
 * Returns an Array of two-element Arrays [is_tag, chunk] in input order.
 * is_tag is false for text between tags and true for a chunk that starts
 * at '<' and runs through the matching '>' (inclusive). Empty chunks are
 * skipped. If the input ends inside an unterminated tag, the remainder is
 * returned as a final chunk with is_tag == true.
 */
static VALUE c_parse_html(VALUE obj, VALUE str)
{
    StringValue(str);
    VALUE ary = rb_ary_new();
    bool tag = false;
    const char *bp = RSTRING_PTR(str);
    const char *eos = bp + RSTRING_LEN(str);
    const char *ep;

    (void)obj;

    /*
     * Search with an explicit length instead of strstr(): Ruby strings are
     * length-counted and may contain embedded NUL bytes, which would
     * otherwise end the scan early.
     */
    while ((ep = memchr(bp, tag ? '>' : '<', (size_t)(eos - bp))) != NULL) {
        if (tag) {
            ep++;   /* a tag chunk includes its closing '>' */
        }
        if (ep > bp) {
            /*
             * Pass real Ruby booleans: a raw C bool is not a valid VALUE
             * and would be promoted to int in this variadic call.
             */
            VALUE item = rb_ary_new3(2, tag ? Qtrue : Qfalse,
                                     rb_str_new(bp, ep - bp));
            rb_ary_push(ary, item);
        }
        tag = !tag;
        bp = ep;
    }

    /* Whatever follows the last delimiter (trailing text or an open tag). */
    if (eos > bp) {
        VALUE item = rb_ary_new3(2, tag ? Qtrue : Qfalse,
                                 rb_str_new(bp, eos - bp));
        rb_ary_push(ary, item);
    }
    return ary;
}

/*
 * Extension initializer (Ruby's Init_<name> entry-point convention).
 * Registers c_parse_html as the global Ruby function "parse_html",
 * which takes exactly one argument.
 */
void Init_parse_html(void)
{
    rb_define_global_function("parse_html", c_parse_html, 1);
}

ベンチマークを取ってみると、速くなっている。C 版の c_parse_html は Ruby 版の rb_parse_html とアルゴリズムは同じだが、5倍程度速い。scan を使った版と比べても速いようだ。

c 1000              1.920000   0.010000   1.930000 (  1.918048)
rb 1000             9.530000   0.010000   9.540000 (  9.554881)
rb scan 1000        6.870000   0.000000   6.870000 (  6.879216)