There are various ways of parsing text. The usual tools are:
- String.split methods
- StringTokenizer and StreamTokenizer classes
- Scanner class
- Pattern and Matcher classes, which implement regular expressions
- for the most complex parsing tasks, you can use tools such as JavaCC
This example uses Scanner. Here, the contents of a file containing name-value
pairs are read, and each line is parsed into its constituent data.
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Scanner;

/**
 * Reads a text file line by line and parses each line as a name-value pair.
 *
 * <p>Assumes UTF-8 encoding. JDK 7+.
 */
public class ReadWithScanner {

  /** Parses the hard-coded example file and logs each name-value pair found. */
  public static void main(String... aArgs) throws IOException {
    ReadWithScanner parser = new ReadWithScanner("C:\\Temp\\test.txt");
    parser.processLineByLine();
    log("Done.");
  }

  /**
   * Constructor.
   *
   * @param aFileName full name of an existing, readable file.
   */
  public ReadWithScanner(String aFileName) {
    fFilePath = Paths.get(aFileName);
  }

  /**
   * Template method that calls {@link #processLine(String)} once per line of the file.
   *
   * @throws IOException if the file cannot be opened, or if reading it fails part-way.
   */
  public final void processLineByLine() throws IOException {
    try (Scanner scanner = new Scanner(fFilePath, ENCODING.name())) {
      while (scanner.hasNextLine()) {
        processLine(scanner.nextLine());
      }
      // Scanner swallows read errors and simply reports "no more lines";
      // surface them so a truncated read is not mistaken for end of file.
      IOException readFailure = scanner.ioException();
      if (readFailure != null) {
        throw readFailure;
      }
    }
  }

  /**
   * Overridable method for processing lines in different ways.
   *
   * <p>This simple default implementation expects simple name-value pairs,
   * separated by an '=' sign. Examples of valid input:
   * <pre>
   * height = 167cm
   * mass = 65kg
   * disposition = "grumpy"
   * this is the name = this is the value
   * </pre>
   *
   * <p>A line that does not yield both a name and a value (empty line, no '=',
   * nothing after the '=') is reported as invalid instead of throwing.
   *
   * @param aLine a single line of text, without its line terminator.
   */
  protected void processLine(String aLine) {
    // use a second Scanner to parse the content of each line
    try (Scanner scanner = new Scanner(aLine)) {
      scanner.useDelimiter("=");
      if (scanner.hasNext()) {
        String name = scanner.next();
        // the value must be checked for separately: a line such as "abc"
        // has a first token but no second one
        if (scanner.hasNext()) {
          String value = scanner.next();
          log("Name is : " + quote(name.trim()) + ", and Value is : " + quote(value.trim()));
          return;
        }
      }
    }
    log("Empty or invalid line. Unable to process.");
  }

  // PRIVATE

  private final Path fFilePath;
  private static final Charset ENCODING = StandardCharsets.UTF_8;

  private static void log(Object aObject) {
    System.out.println(String.valueOf(aObject));
  }

  /** Wraps the given text in single quotes, for display. */
  private String quote(String aText) {
    String QUOTE = "'";
    return QUOTE + aText + QUOTE;
  }
}
For a file containing:
height = 167cm
mass = 65kg
disposition = "grumpy"
this is the name = this is the value
the output of the above class is:
Name is : 'height', and Value is : '167cm'
Name is : 'mass', and Value is : '65kg'
Name is : 'disposition', and Value is : '"grumpy"'
Name is : 'this is the name', and Value is : 'this is the value'
Done.
Example 2
This example uses StringTokenizer. This class is used to parse the text entered into a search box on a web page. It returns a Set of tokens to be used for pattern matching. Here, any text appearing in quotes is treated as a single search token. All other text is split into tokens based simply on whitespace.
An example run:
>java -cp . SearchBoxParser
[mars, venus, milky way, sun]
import java.util.*;

/**
 * The user enters text into a search box. This class is used
 * to parse that text into specific search terms (or tokens).
 * It eliminates common words, and allows for the quoting of text, using
 * double quotes.
 * JDK 7+.
 */
public final class SearchBoxParser {

  /** Parses a fixed example search string and prints the resulting tokens. */
  public static void main(String... aArguments) {
    SearchBoxParser parser = new SearchBoxParser("mars venus \"milky way\" sun");
    Set<String> tokens = parser.parseSearchText();
    // display the tokens
    System.out.println(tokens);
  }

  /**
   * @param aSearchText is non-null, but may have no content,
   * and represents what the user has input in a search box.
   * @throws IllegalArgumentException if aSearchText is null.
   */
  public SearchBoxParser(String aSearchText) {
    if (aSearchText == null) {
      throw new IllegalArgumentException("Search Text cannot be null.");
    }
    fSearchText = aSearchText;
  }

  /**
   * Parse the user's search box input into a Set of String tokens.
   *
   * @return Set of Strings, in order of first appearance, one for each word
   * in the search text; here "word" is defined as either a lone word
   * surrounded by whitespace, or as a series of words surrounded by double
   * quotes, "like this"; also, very common words (and, the, etc.) do not
   * qualify as possible search targets, regardless of letter case.
   */
  public Set<String> parseSearchText() {
    Set<String> result = new LinkedHashSet<>();

    // delimiters are returned as tokens too, so that quotes can be detected
    boolean returnTokens = true;
    String currentDelims = fWHITESPACE_AND_QUOTES;
    StringTokenizer parser = new StringTokenizer(fSearchText, currentDelims, returnTokens);

    while (parser.hasMoreTokens()) {
      String token = parser.nextToken(currentDelims);
      if (isDoubleQuote(token)) {
        // entering or leaving a quoted phrase: inside quotes, whitespace
        // no longer separates tokens
        currentDelims = flipDelimiters(currentDelims);
      } else {
        addNonTrivialWordToResult(token, result);
      }
    }
    return result;
  }

  // PRIVATE

  private final String fSearchText;

  private static final String fDOUBLE_QUOTE = "\"";

  // the parser flips between these two sets of delimiters
  private static final String fWHITESPACE_AND_QUOTES = " \t\r\n\"";
  private static final String fQUOTES_ONLY = "\"";

  /** Very common words to be excluded from searches; all in lower case. */
  private static final Set<String> fCOMMON_WORDS =
      Collections.unmodifiableSet(
          new LinkedHashSet<>(
              Arrays.asList(
                  "a", "and", "be", "for", "from", "has", "i",
                  "in", "is", "it", "of", "on", "to", "the")));

  /**
   * Use to determine if a particular word entered in the
   * search box should be discarded from the search.
   * The comparison ignores letter case, so "The" is as common as "the".
   */
  private boolean isCommonWord(String aSearchTokenCandidate) {
    return fCOMMON_WORDS.contains(aSearchTokenCandidate.toLowerCase(Locale.ROOT));
  }

  private boolean textHasContent(String aText) {
    return (aText != null) && (!aText.trim().equals(""));
  }

  /** Adds the trimmed token to the result, unless it is blank or a common word. */
  private void addNonTrivialWordToResult(String aToken, Set<String> aResult) {
    if (!textHasContent(aToken)) {
      return;
    }
    String trimmed = aToken.trim();
    if (!isCommonWord(trimmed)) {
      aResult.add(trimmed);
    }
  }

  private boolean isDoubleQuote(String aToken) {
    return aToken.equals(fDOUBLE_QUOTE);
  }

  /** Returns whichever of the two delimiter sets is not currently in use. */
  private String flipDelimiters(String aCurrentDelims) {
    return aCurrentDelims.equals(fWHITESPACE_AND_QUOTES)
        ? fQUOTES_ONLY
        : fWHITESPACE_AND_QUOTES;
  }
}
Example 3
This example demonstrates use of regular expressions, by parsing a fully-qualified
type name into two parts - the package and the "simple" type name.
import java.util.regex.*;

/**
 * Demonstrates regular expressions by splitting a fully-qualified type name
 * into its package and its "simple" type name.
 */
public final class RegularExpressions {

  /**
   * The pattern is matched to the first argument.
   * If no argument is supplied, a usage message is printed instead.
   */
  public static void main(String... aArguments) {
    if (aArguments.length == 0) {
      System.err.println("Usage: java RegularExpressions <text>");
      return;
    }
    matchParts(aArguments[0]);
    matchAll(aArguments[0]);
  }

  /**
   * The Matcher.find method attempts to match *parts* of the input
   * to the given pattern.
   */
  private static void matchParts(String aText) {
    log(fNEW_LINE + "Match PARTS:");
    Matcher matcher = fPATTERN.matcher(aText);
    while (matcher.find()) {
      logGroups(matcher);
    }
  }

  /**
   * The Matcher.matches method attempts to match the *entire*
   * input to the given pattern all at once.
   */
  private static void matchAll(String aText) {
    log(fNEW_LINE + "Match ALL:");
    Matcher matcher = fPATTERN.matcher(aText);
    if (matcher.matches()) {
      logGroups(matcher);
    } else {
      log("Input does not match pattern.");
    }
  }

  /** Logs the group count, package and class of the matcher's current match. */
  private static void logGroups(Matcher aMatcher) {
    log("Num groups: " + aMatcher.groupCount());
    log("Package: " + aMatcher.group(1));
    log("Class: " + aMatcher.group(2));
  }

  // PRIVATE

  private static final String fNEW_LINE = System.getProperty("line.separator");

  /** Writes the message to standard output. */
  private static void log(String aMessage) {
    System.out.println(aMessage);
  }

  /**
   * A commented regular expression for fully-qualified type names which
   * follow the common naming conventions, for example, "com.myappBlah.Thing".
   *
   * Thus, the "dot + capital letter" is sufficient to define where the
   * package names end.
   *
   * This regular expression uses two groups, one for the package, and one
   * for the class. Groups are defined by parentheses. Note that ?: will
   * define a group as "non-contributing"; that is, it will not contribute
   * to the return values of the <tt>group</tt> method.
   *
   * As you can see, regular expressions are often cryptic.
   */
  private static final String fREGEXP =
      "#Group1 - Package prefix without last dot: " + fNEW_LINE
      + "( (?:\\w|\\.)+ ) \\." + fNEW_LINE
      + "#Group2 - Class name starts with uppercase: " + fNEW_LINE
      + "( [A-Z](?:\\w)+ )";

  /**
   * Compiled once and shared. Note the necessity of the comments flag,
   * since the regular expression contains comments. Must be declared after
   * fREGEXP, since static initializers run in textual order.
   */
  private static final Pattern fPATTERN = Pattern.compile(fREGEXP, Pattern.COMMENTS);
}
Some example runs:
>java -cp . RegularExpressions "java.java.Thing java.lang.Random"
Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Num groups: 2
Package: java.lang
Class: Random
Match ALL:
Input does not match pattern.
>java -cp . RegularExpressions "java.java.Thing"
Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Match ALL:
Num groups: 2
Package: java.java
Class: Thing
>java -cp . RegularExpressions "java.java.Thing "
Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Match ALL:
Input does not match pattern.