Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Database#createFTS5Tokenizer API #944

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
46 changes: 46 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- [Database#aggregate()](#aggregatename-options---this)
- [Database#table()](#tablename-definition---this)
- [Database#loadExtension()](#loadextensionpath-entrypoint---this)
- [Database#createFTS5Tokenizer()](#createfts5tokenizername-factory---this)
- [Database#exec()](#execstring---this)
- [Database#close()](#close---this)
- [Properties](#properties)
Expand Down Expand Up @@ -372,6 +373,51 @@ It's your responsibility to make sure the extensions you load are compiled/linke
db.loadExtension('./my-extensions/compress.so');
```

### .createFTS5Tokenizer(*name*, *factory*) -> *this*

Creates a custom JavaScript-based tokenizer for [FTS5](https://www.sqlite.org/fts5.html#tokenizers), SQLite's full-text search extension.

One of the main use cases for such a tokenizer is adding support for CJK text and non-Latin locales to FTS5. As an example, this can be done with the `Intl.Segmenter` API:
```js
// Word granularity is required: `isWordLike` is only reported for word segments.
const segmenter = new Intl.Segmenter(undefined, { granularity: 'word' });

db.createFTS5Tokenizer('js_tokenizer', class Tokenizer {
  constructor(params) {
    // params will be ["param1", "param2"]
  }

  // Returns a flat array of (start, end, token) triples, one per word.
  run(str) {
    const result = [];
    // Running UTF-8 byte offset of the current segment within `str`
    let off = 0;
    for (const seg of segmenter.segment(str)) {
      const len = Buffer.byteLength(seg.segment);
      if (seg.isWordLike) {
        // Remove diacritic symbols
        const normalized = seg.segment.normalize('NFD')
          .replace(/[\u0300-\u036f]/g, '');
        result.push(
          // Segment start byte offset
          off,
          // Segment end byte offset
          off + len,
          // Either normalized segment or a `null` (optimization)
          normalized === seg.segment ? null : normalized,
        );
      }
      // Advance past every segment, including non-word ones (spaces, punctuation)
      off += len;
    }
    return result;
  }
});

db.exec(`
CREATE VIRTUAL TABLE fts_table USING fts5(
body,
tokenize='js_tokenizer param1 param2'
);

INSERT INTO fts_table(body) VALUES ('hello world');
`);
```

### .exec(*string*) -> *this*

Executes the given SQL string. Unlike [prepared statements](#preparestring---statement), this can execute strings that contain multiple SQL statements. This function performs worse and is less safe than using [prepared statements](#preparestring---statement). You should only use this method when you need to execute SQL from an external source (usually a file). If an error occurs, execution stops and further statements are not executed. You must rollback changes manually.
Expand Down
3 changes: 3 additions & 0 deletions lib/database.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ function Database(filenameGiven, options) {
});
}

/** Intentional no-op: ignores any arguments, does nothing, and returns undefined. */
function noop() {
  return undefined;
}

const wrappers = require('./methods/wrappers');
Database.prototype.prepare = wrappers.prepare;
Database.prototype.transaction = require('./methods/transaction');
Expand All @@ -74,6 +76,7 @@ Database.prototype.serialize = require('./methods/serialize');
Database.prototype.function = require('./methods/function');
Database.prototype.aggregate = require('./methods/aggregate');
Database.prototype.table = require('./methods/table');
Database.prototype.createFTS5Tokenizer = require('./methods/createFTS5Tokenizer');
Database.prototype.loadExtension = wrappers.loadExtension;
Database.prototype.exec = wrappers.exec;
Database.prototype.close = wrappers.close;
Expand Down
24 changes: 24 additions & 0 deletions lib/methods/createFTS5Tokenizer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
'use strict';
const { cppdb } = require('../util');

module.exports = function createFTS5Tokenizer(name, factory) {
// Validate arguments
if (typeof name !== 'string') throw new TypeError('Expected first argument to be a string');
if (!name) throw new TypeError('Virtual table module name cannot be an empty string');
if (typeof factory !== 'function') throw new TypeError('Expected second argument to be a constructor');

this[cppdb].createFTS5Tokenizer(name, function create(params) {
const instance = new factory(params);

function run(str) {
if (!instance.run) {
// This will throw in C++
return;
}
return instance.run(str);
}

return run;
});
return this;
};
302 changes: 239 additions & 63 deletions src/better_sqlite3.cpp

Large diffs are not rendered by default.