//package ca.carleton.scs.text;
/**
* Porter's algorithm to canonicalize an English word. The
* algorithm removes suffix morphemes for plurals, participles, etc.
*
* This implementation is adapted from the CNIDR freeWAIS and
* Harvest implementations of Porter's algorithm.
*
* @author Darcy Quesnel
* @version 1998 February
*/
public class Porter {
    /*
     * Each rule table is a list of {suffix, replacement} pairs. Within a
     * table the rules are tried in order and only the first suffix that
     * matches the word is considered (see replaceEnd), so longer suffixes
     * must be listed before shorter ones they contain.
     */

    /* Step 1a: plurals. */
    private static final String[][] STEP_1A_RULES = {
        {"sses", "ss"}, {"ies", "i"}, {"ss", "ss"}, {"s", ""} };

    /* Step 1b, part 0: "eed" participles. */
    private static final String[][] STEP_1B0_RULES = {
        {"eed", "ee"} };

    /* Step 1b, part 1: "ed" and "ing" participles. */
    private static final String[][] STEP_1B1_RULES = {
        {"ed", ""}, {"ing", ""} };

    /* Step 1b, part 2: tidy up the stem left after part 1. */
    private static final String[][] STEP_1B2_RULES = {
        {"at", "ate"}, {"bl", "ble"}, {"is", "ise"}, {"iz", "ize"},
        {"bb", "b"}, {"dd", "d"}, {"ff", "f"}, {"gg", "g"},
        {"mm", "m"}, {"nn", "n"}, {"pp", "p"}, {"rr", "r"},
        {"tt", "t"}, {"ww", "w"}, {"xx", "x"} };

    /* Step 1b, part 3: append an "e" (the empty suffix matches any word). */
    private static final String[][] STEP_1B3_RULES = {
        {"", "e"} };

    /* Step 1c: change some y's to i's. */
    private static final String[][] STEP_1C_RULES = {
        {"y", "i"} };

    /* Step 2: double and triple suffixes. */
    private static final String[][] STEP_2_RULES = {
        {"ational", "ate"}, {"tional", "tion"}, {"enci", "ence"},
        {"anci", "ance"}, {"iser", "ise"}, {"izer", "ize"},
        {"abli", "able"}, {"alli", "al"}, {"entli", "ent"},
        {"eli", "e"}, {"ousli", "ous"}, {"isation", "ise"},
        {"ization", "ize"}, {"ation", "ate"}, {"ator", "ate"},
        {"alism", "al"}, {"iveness", "ive"}, {"fulness", "ful"},
        {"ousness", "ous"}, {"aliti", "al"}, {"iviti", "ive"},
        {"biliti", "ble"} };

    /* Step 3: more double and triple suffixes. */
    private static final String[][] STEP_3_RULES = {
        {"icate", "ic"}, {"ative", ""}, {"alise", "al"}, {"alize", "al"},
        {"iciti", "ic"}, {"ful", ""}, {"ness", ""} };

    /* Step 4: single suffixes on polysyllables. */
    private static final String[][] STEP_4_RULES = {
        {"al", ""}, {"ance", ""}, {"ence", ""}, {"er", ""}, {"ic", ""},
        {"able", ""}, {"ible", ""}, {"ant", ""}, {"ement", ""},
        {"ment", ""}, {"ent", ""}, {"sion", "s"}, {"tion", "t"},
        {"ou", ""}, {"ism", ""}, {"ate", ""}, {"iti", ""}, {"ous", ""},
        {"ive", ""}, {"ise", ""}, {"ize", ""} };

    /* Step 5a: remove some final e's. */
    private static final String[][] STEP_5A_RULES = {
        {"e", ""} };

    /* Step 5b: remove some final double l's. */
    private static final String[][] STEP_5B_RULES = {
        {"ll", "l"} };

    /* Class not to be instantiated. */
    private Porter() {
    }

    /* Tells whether c is one of the lowercase letters a, e, i, o, u. */
    private static boolean isVowel(char c) {
        return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
    }

    /* Tells whether c is one of the lowercase letters a, e, i, o, u, y. */
    private static boolean isVowelOrY(char c) {
        return isVowel(c) || c == 'y';
    }

    /**
     * Counts the number of syllables in a word. A syllable is counted for
     * every position where a vowel (a, e, i, o, u or y) is immediately
     * followed by a character that is not one of those letters, so an
     * initial consonant run and a trailing vowel run contribute nothing.
     * Only lowercase letters are recognized as vowels.
     *
     * @param word the {@code String} whose syllables are counted
     * @return the number of syllables in the given {@code String};
     *         0 for an empty or single-character word
     */
    public static int countSyllables(String word) {
        int syllables = 0;
        for (int i = 0; i < word.length() - 1; i++) {
            if (isVowelOrY(word.charAt(i)) && !isVowelOrY(word.charAt(i + 1))) {
                syllables++;
            }
        }
        return syllables;
    }

    /**
     * Recognizes a vowel in a given word. The letters a, e, i, o and u
     * always count. A y counts when it follows another character, or when
     * it is followed by a character that is not a, e, i, o or u. Only
     * lowercase letters are recognized.
     *
     * @param word the {@code String} in which to search for a vowel
     * @return whether the given {@code String} contains a vowel
     */
    public static boolean containsVowel(String word) {
        int length = word.length();
        for (int i = 0; i < length; i++) {
            char current = word.charAt(i);
            if (isVowel(current)) {
                return true;
            }
            if (i + 1 < length) {
                char next = word.charAt(i + 1);
                // A y preceded by any non-vowel character acts as a vowel.
                if (next == 'y') {
                    return true;
                }
                // A y followed by a consonant acts as a vowel.
                if (current == 'y' && !isVowel(next)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Recognizes a trailing consonant-vowel-consonant (CVD). The last
     * consonant doesn't include w, x, or y.
     * <p>
     * As implemented, this returns true exactly when the last character is
     * not one of a, e, i, o, u, w, x, y and the character before it is one
     * of a, e, i, o, u, y. The leading consonant is not actually required:
     * "at" is accepted as well as "hop". That behavior is kept unchanged
     * so stems stay stable.
     * <p>
     * Words shorter than two characters, including the empty string, yield
     * false. Previously the empty string raised a
     * {@code StringIndexOutOfBoundsException}.
     *
     * @param word the {@code String} in which to search for CVD
     * @return whether the given {@code String} ends with CVD
     */
    public static boolean containsCVD(String word) {
        int length = word.length();
        if (length < 2) {
            // Too short to hold a vowel followed by a consonant.
            return false;
        }
        char last = word.charAt(length - 1);
        if (isVowelOrY(last) || last == 'w' || last == 'x') {
            return false;
        }
        return isVowelOrY(word.charAt(length - 2));
    }

    /**
     * Applies the first appropriate rule to the given word, with no
     * minimum on the number of syllables in the stem.
     *
     * @param word the word to be transformed
     * @param rules the {suffix, replacement} rules with which to
     *        transform the word
     * @return the transformed word, or the word itself if no rule matches
     */
    protected static String replaceEnd(String word, String[][] rules) {
        return replaceEnd(word, rules, -1);
    }

    /**
     * Applies the first appropriate rule to the given word. Ensures
     * the stem has at least the given number of syllables before
     * applying. Only the first rule whose suffix matches is considered:
     * if its stem is too short the word is returned unchanged and no
     * later rule is tried.
     *
     * @param word the word to be transformed
     * @param rules the {suffix, replacement} rules with which to
     *        transform the word
     * @param syllables the minimum number of syllables (as counted by
     *        {@link #countSyllables}) the stem must have
     * @return the transformed word, or the word itself if no rule applies
     */
    protected static String replaceEnd(String word, String[][] rules,
                                       int syllables) {
        for (int i = 0; i < rules.length; i++) {
            String suffix = rules[i][0];
            if (!word.endsWith(suffix)) {
                continue;
            }
            String stem = word.substring(0, word.length() - suffix.length());
            if (syllables <= countSyllables(stem)) {
                return stem + rules[i][1];
            }
            // First matching suffix wins, even when its condition fails.
            return word;
        }
        return word;
    }

    /**
     * Canonicalizes an English word. (This is the main exported method of
     * this class.) The word is expected to be lowercase: all suffix and
     * vowel comparisons are made against lowercase literals.
     * <p>
     * Very short inputs may reduce to the empty string (for example "s",
     * "ed" or "ing").
     *
     * @param word the {@code String} to be canonicalized
     * @return the canonical version of the given {@code String}
     * @throws NullPointerException if {@code word} is null
     */
    public static String canonicalize(String word) {
        // The steps are numbered similar to previous implementations
        // and, apparently, Porter's original article.

        // Step 1a
        if (word.endsWith("s")) {
            word = replaceEnd(word, STEP_1A_RULES);
        }

        // Step 1b
        if (word.endsWith("eed")) {
            word = replaceEnd(word, STEP_1B0_RULES, 1);
        } else if (word.endsWith("ed") || word.endsWith("ing")) {
            // NOTE(review): the suffix is stripped without checking that
            // the remaining stem contains a vowel, so e.g. "sing" becomes
            // "s". Kept as-is to avoid changing existing stems; confirm
            // whether a containsVowel guard was intended here.
            word = replaceEnd(word, STEP_1B1_RULES);
            word = replaceEnd(word, STEP_1B2_RULES);
            if (containsCVD(word)) {
                word = replaceEnd(word, STEP_1B3_RULES);
            }
        }

        // Step 1c
        if ( word.endsWith("y")
                && containsVowel(word.substring(0, word.length() - 1)) ) {
            word = replaceEnd(word, STEP_1C_RULES);
        }

        // Step 2
        word = replaceEnd(word, STEP_2_RULES, 1);

        // Step 3
        word = replaceEnd(word, STEP_3_RULES, 1);

        // Step 4
        // NOTE(review): step 4 is applied twice in a row. This is
        // preserved from the original code; confirm it is intentional.
        word = replaceEnd(word, STEP_4_RULES, 2);
        word = replaceEnd(word, STEP_4_RULES, 2);

        // Step 5a
        if (word.endsWith("e")) {
            word = replaceEnd(word, STEP_5A_RULES, 2);
            // The stem may be empty here (word is just "e"); containsCVD
            // returns false for it rather than failing.
            if ( word.endsWith("e")
                    && containsCVD(word.substring(0, word.length() - 1)) ) {
                word = replaceEnd(word, STEP_5A_RULES);
            }
        }

        // Step 5b
        if (word.endsWith("ll")) {
            word = replaceEnd(word, STEP_5B_RULES, 1);
        }

        return word;
    }
}