// TokenTools.java
package eu.javaexperience.text;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import eu.javaexperience.interfaces.simple.publish.SimplePublish2;
import eu.javaexperience.reflect.Mirror;
/**
 * Static helpers for tokenizing strings and collecting per-token statistics.
 */
public class TokenTools
{
	/**
	 * Result holder filled by
	 * {@link TokenTools#indexTokens(Collection, SimplePublish2, boolean)}.
	 */
	public static class KeywordGroupIndex
	{
		/** Maps each token to the number of times it was counted over all indexed strings. */
		public Map<String, Integer> tokenFrequency = new HashMap<>();

		/** Maps each indexed string to the array of tokens collected for it. */
		public Map<String, String[]> stringTokens = new HashMap<>();
	}

	/**
	 * Tokenizes every string and builds an index of the tokens found.
	 *
	 * <p>For each string the tokenizer is called with the string and an emptied
	 * collection; whatever the collection contains afterwards is stored as that
	 * string's tokens and every contained token has its frequency increased by one.
	 * If the same string occurs more than once in {@code strings}, its
	 * {@code stringTokens} entry is overwritten while its tokens are counted again.
	 *
	 * @param strings the strings to index
	 * @param tokenize callback receiving a string and the collection that gathers its tokens
	 * @param ignoreDuplicateTokensInString when {@code true} the tokens are gathered in a
	 *        {@link HashSet} (so a token is counted at most once per string and the order
	 *        of the stored token array is the set's iteration order); when {@code false}
	 *        they are gathered in an {@link ArrayList}, keeping repetitions and order
	 * @return the populated index
	 */
	public static KeywordGroupIndex indexTokens
	(
		Collection<String> strings,
		SimplePublish2<String, Collection<String>> tokenize,
		boolean ignoreDuplicateTokensInString
	)
	{
		KeywordGroupIndex index = new KeywordGroupIndex();

		// One scratch collection is reused for all strings; it is cleared before each use.
		Collection<String> buffer;
		if(ignoreDuplicateTokensInString)
		{
			buffer = new HashSet<String>();
		}
		else
		{
			buffer = new ArrayList<String>();
		}

		for(String source : strings)
		{
			buffer.clear();
			tokenize.publish(source, buffer);

			// Snapshot the tokens into an array, because the buffer is recycled on the next pass.
			String[] snapshot = buffer.toArray(Mirror.emptyStringArray);
			index.stringTokens.put(source, snapshot);

			for(String token : buffer)
			{
				Integer seen = index.tokenFrequency.get(token);
				index.tokenFrequency.put(token, null == seen ? 1 : seen + 1);
			}
		}

		return index;
	}
}