Quick Start

Web (Browser) -- OPFS Dictionary Loading

The recommended approach is to download a dictionary at runtime and cache it in the browser's Origin Private File System (OPFS) using the provided helpers:

import __wbg_init, { TokenizerBuilder, loadDictionaryFromBytes } from 'lindera-wasm-web';
import { downloadDictionary, loadDictionaryFiles, hasDictionary } from 'lindera-wasm-web/opfs';

async function main() {
    await __wbg_init();

    // Download dictionary if not cached
    if (!await hasDictionary("ipadic")) {
        await downloadDictionary(
            "https://github.com/lindera/lindera/releases/download/<version>/lindera-ipadic-<version>.zip",
            "ipadic",
        );
    }

    // Load dictionary from OPFS
    const files = await loadDictionaryFiles("ipadic");
    const dictionary = loadDictionaryFromBytes(
        files.metadata, files.dictDa, files.dictVals, files.dictWordsIdx,
        files.dictWords, files.matrixMtx, files.charDef, files.unk,
    );

    // Build tokenizer
    const builder = new TokenizerBuilder();
    builder.setDictionaryInstance(dictionary);
    builder.setMode("normal");
    const tokenizer = builder.build();

    const tokens = tokenizer.tokenize("関西国際空港限定トートバッグ");
    tokens.forEach(token => {
        console.log(`${token.surface}\t${token.details.join(',')}`);
    });
}

main();

Note: The URL above points to a pre-built dictionary archive on GitHub Releases; replace <version> with an actual release version. See OPFS Dictionary Storage for the full workflow.

Expected output:

関西国際空港    名詞,固有名詞,一般,*,*,*,関西国際空港,カンサイコクサイクウコウ,カンサイコクサイクーコー
限定    名詞,サ変接続,*,*,*,*,限定,ゲンテイ,ゲンテイ
トートバッグ    名詞,一般,*,*,*,*,*,*,*

Using Embedded Dictionaries (Advanced)

If you built the package with an embed-* feature flag (see the build note after this example), you can use an embedded dictionary:

import __wbg_init, { TokenizerBuilder } from 'lindera-wasm-web-ipadic';

async function main() {
    await __wbg_init();

    const builder = new TokenizerBuilder();
    builder.setDictionary("embedded://ipadic");
    builder.setMode("normal");
    const tokenizer = builder.build();

    const tokens = tokenizer.tokenize("関西国際空港限定トートバッグ");
    tokens.forEach(token => {
        console.log(`${token.surface}\t${token.details.join(',')}`);
    });
}

main();
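
The embedded packages are separate builds of the crate with the dictionary compiled in. As a rough sketch, assuming the package is built with wasm-pack and the feature follows the embed-* naming above (check the project's build documentation for the exact feature and package names):

# assumes an embed-ipadic feature; adjust for the dictionary you need
wasm-pack build --release --target web -- --features embed-ipadic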

Using Filters

You can add character filters and token filters to the tokenization pipeline:

import __wbg_init, { TokenizerBuilder, loadDictionaryFromBytes } from 'lindera-wasm-web';
import { loadDictionaryFiles } from 'lindera-wasm-web/opfs';

async function main() {
    await __wbg_init();

    // Assume dictionary is already cached in OPFS
    const files = await loadDictionaryFiles("ipadic");
    const dictionary = loadDictionaryFromBytes(
        files.metadata, files.dictDa, files.dictVals, files.dictWordsIdx,
        files.dictWords, files.matrixMtx, files.charDef, files.unk,
    );

    const builder = new TokenizerBuilder();
    builder.setDictionaryInstance(dictionary);
    builder.setMode("normal");

    // Add Unicode NFKC normalization
    builder.appendCharacterFilter("unicode_normalize", { kind: "nfkc" });

    // Add a stop-tags filter to remove particles and auxiliary verbs
    builder.appendTokenFilter("japanese_stop_tags", {
        tags: ["助詞", "助動詞"]
    });

    const tokenizer = builder.build();
    const tokens = tokenizer.tokenize("Linderaは形態素解析エンジンです");
    tokens.forEach(token => {
        console.log(`${token.surface}\t${token.details.join(',')}`);
    });
}

main();
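
Lindera's core library provides additional filters that can be appended the same way. A minimal sketch, continuing with the builder from the example above and assuming the WebAssembly binding exposes the core filter names and arguments unchanged:

// Assumes these core Lindera filters are available in the wasm build
// Normalize iteration marks such as 々 and ゝ before tokenization
builder.appendCharacterFilter("japanese_iteration_mark", {
    normalize_kanji: true,
    normalize_kana: true
});

// Strip the trailing long-vowel mark (ー) from katakana tokens of at least `min` characters
builder.appendTokenFilter("japanese_katakana_stem", { min: 3 });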

N-Best Tokenization

Retrieve multiple tokenization candidates ranked by cost (this snippet reuses a tokenizer built as in the examples above):

const results = tokenizer.tokenizeNbest("すもももももももものうち", 3);
results.forEach((result, rank) => {
    console.log(`--- NBEST ${rank + 1} (cost=${result.cost}) ---`);
    result.tokens.forEach(token => {
        console.log(`${token.surface}\t${token.details.join(',')}`);
    });
});