Skip to content

Commit 02dab41

Browse files
drupolakondas
authored and committed
Provide a new NGramTokenizer with minGram and maxGram support (#350)
* Issue #349: Provide a new NGramTokenizer. * Issue #349: Add tests. * Fixes from code review. * Implement NGramTokenizer with min and max gram support * Add missing tests for ngram * Add info about NGramTokenizer to docs and readme * Add performance test for tokenization
1 parent b3fe9da commit 02dab41

File tree

8 files changed

+246
-27
lines changed

8 files changed

+246
-27
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
102102
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
103103
* Feature Extraction
104104
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
105+
* NGramTokenizer
106+
* WhitespaceTokenizer
107+
* WordTokenizer
105108
* [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
106109
* Dimensionality Reduction
107110
* PCA (Principal Component Analysis)

docs/machine-learning/feature-extraction/token-count-vectorizer.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,21 @@ $vectorizer->getVocabulary();
5353

5454
* WhitespaceTokenizer - select tokens by whitespace.
5555
* WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).
56+
* NGramTokenizer - generates continuous sequences of characters of the specified lengths. N-grams are useful for querying languages that don’t use spaces or that have long compound words, like German.
57+
58+
**NGramTokenizer**
59+
60+
The NGramTokenizer tokenizer accepts the following parameters:
61+
62+
`$minGram` - minimum length of characters in a gram. Defaults to 1.
63+
`$maxGram` - maximum length of characters in a gram. Defaults to 2.
64+
65+
```php
66+
use Phpml\Tokenization\NGramTokenizer;
67+
68+
$tokenizer = new NGramTokenizer(1, 2);
69+
70+
$tokenizer->tokenize('Quick Fox');
71+
72+
// returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox']
73+
```
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
<?php

declare(strict_types=1);

namespace Phpml\Tokenization;

use Phpml\Exception\InvalidArgumentException;

/**
 * Tokenizer producing character n-grams: every contiguous character sequence
 * of length $minGram through $maxGram, generated independently for each word.
 *
 * Words are extracted with the same pattern WordTokenizer uses (two or more
 * word characters; punctuation and whitespace act as separators), so
 * single-character words never contribute grams. All length/slice operations
 * use mb_* functions, making the tokenizer multibyte (UTF-8) safe.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * Minimum gram length, inclusive.
     *
     * @var int
     */
    private $minGram;

    /**
     * Maximum gram length, inclusive.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram minimum length of characters in a gram (default 1)
     * @param int $maxGram maximum length of characters in a gram (default 2)
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram > $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * {@inheritdoc}
     */
    public function tokenize(string $text): array
    {
        $words = [];
        // Same word pattern as WordTokenizer: only runs of 2+ word characters
        // qualify, so 1-character words are silently skipped.
        preg_match_all('/\w\w+/u', $text, $words);

        $nGrams = [];
        foreach ($words[0] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends every n-gram of $word (lengths minGram..maxGram, shortest first,
     * left to right within each length) to $nGrams.
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        $length = mb_strlen($word);

        // Start directly at minGram rather than iterating from 1 and filtering
        // inside the loop: lengths below minGram can never emit a gram, so the
        // output (and its order) is identical while skipping dead iterations.
        for ($j = $this->minGram; $j <= $this->maxGram; $j++) {
            for ($k = 0; $k < $length - $j + 1; $k++) {
                $nGrams[] = mb_substr($word, $k, $j);
            }
        }
    }
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
<?php

declare(strict_types=1);

namespace Phpml\Tests\Performance\Tokenization;

use PhpBench\Benchmark\Metadata\Annotations\Iterations;
use PhpBench\Benchmark\Metadata\Annotations\Revs;
use Phpml\Tokenization\NGramTokenizer;

/**
 * PhpBench benchmark tracking NGramTokenizer throughput on a fixed
 * Lorem-Ipsum paragraph with a (2, 3) gram configuration.
 */
final class NGramTokenizerBench
{
    /**
     * @Revs(1000)
     * @Iterations(5)
     */
    public function benchSimpleTokenizer(): void
    {
        // Fixed input so every revolution does identical work.
        $text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent placerat blandit cursus. Suspendisse sed
            turpis sit amet enim viverra sodales a euismod est. Ut vitae tincidunt est. Proin venenatis placerat nunc
            sed ornare. Etiam feugiat, nisl nec sollicitudin sodales, nulla massa sollicitudin ipsum, vitae cursus ante
            velit vitae arcu. Vestibulum feugiat ultricies hendrerit. Morbi sed varius metus. Nam feugiat maximus
            turpis, a sollicitudin ligula porttitor eu.Fusce hendrerit tellus et dignissim sagittis. Nulla consectetur
            condimentum tortor, non bibendum erat lacinia eget. Integer vitae maximus tortor. Vestibulum ante ipsum
            primis in faucibus orci luctus et ultrices posuere cubilia Curae; Pellentesque suscipit sem ipsum, in
            tincidunt risus pellentesque vel. Nullam hendrerit consequat leo, in suscipit lectus euismod non. Cras arcu
            lacus, lacinia semper mauris vel, pharetra dignissim velit. Nam lacinia turpis a nibh bibendum, et
            placerat tellus accumsan. Sed tincidunt cursus nisi in laoreet. Suspendisse amet.';

        (new NGramTokenizer(2, 3))->tokenize($text);
    }
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
<?php

declare(strict_types=1);

namespace Phpml\Tests\Tokenization;

use Phpml\Exception\InvalidArgumentException;
use Phpml\Tokenization\NGramTokenizer;

/**
 * Inspiration: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
 */
class NGramTokenizerTest extends TokenizerTest
{
    /**
     * Tokenization must produce, per word, all grams of each length from
     * $minGram to $maxGram, shortest length first, left to right.
     *
     * @dataProvider textDataProvider
     */
    public function testNGramTokenization(int $minGram, int $maxGram, string $text, array $tokens): void
    {
        $tokenizer = new NGramTokenizer($minGram, $maxGram);

        self::assertEquals($tokens, $tokenizer->tokenize($text));
    }

    public function testMinGramGreaterThanMaxGramNotAllowed(): void
    {
        self::expectException(InvalidArgumentException::class);

        new NGramTokenizer(5, 2);
    }

    public function testMinGramValueTooSmall(): void
    {
        self::expectException(InvalidArgumentException::class);

        new NGramTokenizer(0, 2);
    }

    public function testMaxGramValueTooSmall(): void
    {
        self::expectException(InvalidArgumentException::class);

        new NGramTokenizer(1, 0);
    }

    /**
     * Each dataset: [minGram, maxGram, input text, expected tokens].
     * Note that words shorter than two characters are dropped entirely
     * (the word pattern requires 2+ word characters).
     */
    public function textDataProvider(): array
    {
        return [
            [
                1, 2,
                'Quick Fox',
                ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox'],
            ],
            [
                3, 3,
                'Quick Foxes',
                ['Qui', 'uic', 'ick', 'Fox', 'oxe', 'xes'],
            ],
            [
                1, 2,
                '快狐跑过 边缘跑',
                // 1-grams of each word come before its 2-grams.
                ['快', '狐', '跑', '过', '快狐', '狐跑', '跑过', '边', '缘', '跑', '边缘', '缘跑'],
            ],
            [
                3, 3,
                '快狐跑过狐 边缘跑狐狐',
                ['快狐跑', '狐跑过', '跑过狐', '边缘跑', '缘跑狐', '跑狐狐'],
            ],
            [
                2, 4,
                $this->getSimpleText(),
                [
                    'Lo', 'or', 're', 'em', 'Lor', 'ore', 'rem', 'Lore', 'orem', 'ip', 'ps', 'su', 'um', 'ips', 'psu', 'sum', 'ipsu',
                    'psum', 'do', 'ol', 'lo', 'or', 'dol', 'olo', 'lor', 'dolo', 'olor', 'si', 'it', 'sit', 'am', 'me', 'et', 'ame',
                    'met', 'amet', 'co', 'on', 'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte',
                    'tet', 'etu', 'tur', 'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'ad', 'di', 'ip', 'pi', 'is',
                    'sc', 'ci', 'in', 'ng', 'adi', 'dip', 'ipi', 'pis', 'isc', 'sci', 'cin', 'ing', 'adip', 'dipi', 'ipis', 'pisc',
                    'isci', 'scin', 'cing', 'el', 'li', 'it', 'eli', 'lit', 'elit', 'Cr', 'ra', 'as', 'Cra', 'ras', 'Cras', 'co', 'on',
                    'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte', 'tet', 'etu', 'tur',
                    'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'du', 'ui', 'dui', 'et', 'lo', 'ob', 'bo', 'or',
                    'rt', 'ti', 'is', 'lob', 'obo', 'bor', 'ort', 'rti', 'tis', 'lobo', 'obor', 'bort', 'orti', 'rtis', 'au', 'uc',
                    'ct', 'to', 'or', 'auc', 'uct', 'cto', 'tor', 'auct', 'ucto', 'ctor', 'Nu', 'ul', 'll', 'la', 'Nul', 'ull', 'lla',
                    'Null', 'ulla', 'vi', 'it', 'ta', 'ae', 'vit', 'ita', 'tae', 'vita', 'itae', 'co', 'on', 'ng', 'gu', 'ue', 'con',
                    'ong', 'ngu', 'gue', 'cong', 'ongu', 'ngue', 'lo', 'or', 're', 'em', 'lor', 'ore', 'rem', 'lore', 'orem',
                ],
            ],
            [
                2, 4,
                $this->getUtf8Text(),
                [
                    '鋍鞎', '鞮鞢', '鞢騉', '鞮鞢騉', '袟袘', '袘觕', '袟袘觕', '炟砏', '謺貙', '貙蹖', '謺貙蹖', '偢偣', '偣唲',
                    '偢偣唲', '箷箯', '箯緷', '箷箯緷', '鑴鱱', '鱱爧', '鑴鱱爧', '覮轀', '剆坲', '煘煓', '煓瑐', '煘煓瑐', '鬐鶤',
                    '鶤鶐', '鬐鶤鶐', '飹勫', '勫嫢', '飹勫嫢', '枲柊', '柊氠', '枲柊氠', '鍎鞚', '鞚韕', '鍎鞚韕', '焲犈', '殍涾',
                    '涾烰', '殍涾烰', '齞齝', '齝囃', '齞齝囃', '蹅輶', '孻憵', '擙樲', '樲橚', '擙樲橚', '藒襓', '襓謥', '藒襓謥',
                    '岯岪', '岪弨', '岯岪弨', '廞徲', '孻憵', '憵懥', '孻憵懥', '趡趛', '趛踠', '趡趛踠',
                ],
            ],
        ];
    }
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<?php

declare(strict_types=1);

namespace Phpml\Tests\Tokenization;

use PHPUnit\Framework\TestCase;

/**
 * Shared base class for tokenizer test cases.
 *
 * Centralizes the ASCII and UTF-8 fixture texts so the individual tokenizer
 * tests (word, whitespace, n-gram) all operate on identical input.
 */
abstract class TokenizerTest extends TestCase
{
    /**
     * ASCII fixture with punctuation-joined words ("ipsum-dolor",
     * "lobortis;auctor") and a punctuation-only token (",.,/") to exercise
     * separator handling across tokenizers.
     */
    public function getSimpleText(): string
    {
        return 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
            Cras consectetur, dui et lobortis;auctor.
            Nulla vitae ,.,/ congue lorem.';
    }

    /**
     * UTF-8 (CJK) fixture containing multibyte words of varying length,
     * including single-character words, separated by spaces and commas.
     */
    public function getUtf8Text(): string
    {
        return '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
            剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
            殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';
    }
}

tests/Tokenization/WhitespaceTokenizerTest.php

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,37 +5,28 @@
55
namespace Phpml\Tests\Tokenization;
66

77
use Phpml\Tokenization\WhitespaceTokenizer;
8-
use PHPUnit\Framework\TestCase;
98

10-
class WhitespaceTokenizerTest extends TestCase
9+
class WhitespaceTokenizerTest extends TokenizerTest
1110
{
1211
public function testTokenizationOnAscii(): void
1312
{
1413
$tokenizer = new WhitespaceTokenizer();
1514

16-
$text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.
17-
Cras consectetur, dui et lobortis auctor.
18-
Nulla vitae congue lorem.';
15+
$tokens = ['Lorem', 'ipsum-dolor', 'sit', 'amet,', 'consectetur/adipiscing', 'elit.',
16+
'Cras', 'consectetur,', 'dui', 'et', 'lobortis;auctor.',
17+
'Nulla', 'vitae', ',.,/', 'congue', 'lorem.', ];
1918

20-
$tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.',
21-
'Cras', 'consectetur,', 'dui', 'et', 'lobortis', 'auctor.',
22-
'Nulla', 'vitae', 'congue', 'lorem.', ];
23-
24-
self::assertEquals($tokens, $tokenizer->tokenize($text));
19+
self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText()));
2520
}
2621

2722
public function testTokenizationOnUtf8(): void
2823
{
2924
$tokenizer = new WhitespaceTokenizer();
3025

31-
$text = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
32-
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
33-
殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';
34-
3526
$tokens = ['鋍鞎', '', '鞮鞢騉', '袟袘觕,', '炟砏', '', '謺貙蹖', '偢偣唲', '', '箷箯緷', '鑴鱱爧', '覮轀,',
3627
'剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '', '', '枲柊氠', '鍎鞚韕', '焲犈,',
3728
'殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '', '廞徲', '孻憵懥', '趡趛踠', '', ];
3829

39-
self::assertEquals($tokens, $tokenizer->tokenize($text));
30+
self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text()));
4031
}
4132
}

tests/Tokenization/WordTokenizerTest.php

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,37 +5,28 @@
55
namespace Phpml\Tests\Tokenization;
66

77
use Phpml\Tokenization\WordTokenizer;
8-
use PHPUnit\Framework\TestCase;
98

10-
class WordTokenizerTest extends TestCase
9+
class WordTokenizerTest extends TokenizerTest
1110
{
1211
public function testTokenizationOnAscii(): void
1312
{
1413
$tokenizer = new WordTokenizer();
1514

16-
$text = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
17-
Cras consectetur, dui et lobortis;auctor.
18-
Nulla vitae ,.,/ congue lorem.';
19-
2015
$tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
2116
'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor',
2217
'Nulla', 'vitae', 'congue', 'lorem', ];
2318

24-
self::assertEquals($tokens, $tokenizer->tokenize($text));
19+
self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText()));
2520
}
2621

2722
public function testTokenizationOnUtf8(): void
2823
{
2924
$tokenizer = new WordTokenizer();
3025

31-
$text = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
32-
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
33-
殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';
34-
3526
$tokens = ['鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀',
3627
'剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈',
3728
'殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠', ];
3829

39-
self::assertEquals($tokens, $tokenizer->tokenize($text));
30+
self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text()));
4031
}
4132
}

0 commit comments

Comments
 (0)