1: <?php
2: /**
3: * Sastrawi (https://github.com/sastrawi/sastrawi)
4: *
5: * @link http://github.com/sastrawi/sastrawi for the canonical source repository
6: * @license https://github.com/sastrawi/sastrawi/blob/master/LICENSE The MIT License (MIT)
7: */
8:
9: namespace Sastrawi\Stemmer;
10:
11: use Sastrawi\Dictionary\DictionaryInterface;
12: use Sastrawi\Stemmer\Context\Context;
13: use Sastrawi\Stemmer\Context\Visitor\VisitorProvider;
14:
/**
 * Indonesian Stemmer.
 * Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS.
 *
 * Text is normalized, split on single spaces and stemmed word by word.
 * A word containing a hyphen is handled as a plural (reduplicated) form;
 * every other word is stemmed through a Context built from the dictionary
 * and the visitor provider.
 *
 * @link https://github.com/sastrawi/sastrawi/wiki/Resources
 */
class Stemmer implements StemmerInterface
{
    /**
     * The dictionary containing root words
     *
     * @var \Sastrawi\Dictionary\DictionaryInterface
     */
    protected $dictionary;

    /**
     * Visitor provider, created once and shared by every Context this
     * stemmer builds.
     *
     * @var \Sastrawi\Stemmer\Context\Visitor\VisitorProvider
     */
    protected $visitorProvider;

    /**
     * @param \Sastrawi\Dictionary\DictionaryInterface $dictionary the root word dictionary
     *                                                             handed to each stemming Context
     */
    public function __construct(DictionaryInterface $dictionary)
    {
        $this->dictionary      = $dictionary;
        $this->visitorProvider = new VisitorProvider;
    }

    /**
     * @return \Sastrawi\Dictionary\DictionaryInterface
     */
    public function getDictionary()
    {
        return $this->dictionary;
    }

    /**
     * Stem a text string to its common stem form.
     *
     * The text is passed through Filter\TextNormalizer::normalizeText(),
     * split on single spaces, and each piece is stemmed independently.
     * The stems are joined back with single spaces in the original order.
     *
     * @param  string $text the text string to stem, e.g : memberdayakan pembangunan
     * @return string common stem form, e.g : daya bangun
     */
    public function stem($text)
    {
        $normalizedText = Filter\TextNormalizer::normalizeText($text);

        $words = explode(' ', $normalizedText);
        $stems = array();

        foreach ($words as $word) {
            $stems[] = $this->stemWord($word);
        }

        return implode(' ', $stems);
    }

    /**
     * Stem a word to its common stem form.
     *
     * Dispatches to the plural or the singular routine depending on
     * isPlural().
     *
     * @param  string $word the word to stem, e.g : memberdayakan
     * @return string common stem form, e.g : daya
     */
    protected function stemWord($word)
    {
        if ($this->isPlural($word)) {
            return $this->stemPluralWord($word);
        }

        return $this->stemSingularWord($word);
    }

    /**
     * Tell whether a word should be handled as a plural form.
     *
     * The only criterion is the presence of a hyphen anywhere in the word.
     *
     * @param  string $word
     * @return boolean true when the word contains at least one '-'
     */
    protected function isPlural($word)
    {
        return strpos($word, '-') !== false;
    }

    /**
     * Stem a plural word to its common stem form.
     * Asian J. (2007) “Effective Techniques for Indonesian Text Retrieval” page 76-77.
     *
     * Both halves of the hyphenated word are stemmed separately. When they
     * reduce to the same root, that root is returned; otherwise the word is
     * returned unchanged.
     *
     * @param  string $plural the word to stem, e.g : bersama-sama
     * @return string common stem form, e.g : sama
     * @link   http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
     */
    protected function stemPluralWord($plural)
    {
        // The first group is greedy, so the word is split at its LAST hyphen.
        preg_match('/^(.*)-(.*)$/', $plural, $words);

        // No match (e.g. no hyphen at all): nothing to split, keep the word.
        if (!isset($words[1]) || !isset($words[2])) {
            return $plural;
        }

        $rootWord1 = $this->stemSingularWord($words[1]);
        $rootWord2 = $this->stemSingularWord($words[2]);

        // Strict comparison on purpose: with a loose == PHP compares two
        // numeric strings as numbers, so "1-01" or "10-1e1" would be treated
        // as a reduplication and collapsed to the first half.
        if ($rootWord1 === $rootWord2) {
            return $rootWord1;
        }

        return $plural;
    }

    /**
     * Stem a singular word to its common stem form.
     *
     * Builds a fresh Context for the word, executes it and returns the
     * result it reports.
     *
     * @param  string $word the word to stem, e.g : mengalahkan
     * @return string common stem form, e.g : kalah
     */
    protected function stemSingularWord($word)
    {
        $context = new Context($word, $this->dictionary, $this->visitorProvider);
        $context->execute();

        return $context->getResult();
    }
}
135: