Parse text

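There are various ways of parsing text. The usual tools are the Scanner class, the StringTokenizer class, and regular expressions (the java.util.regex package). Each is demonstrated in an example below.

Example 1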

This example uses Scanner. Here, the contents of a file containing name-value pairs are read, and each line is parsed into its constituent data.

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Objects;
import java.util.Scanner;

/**
 Reads a text file line by line with a {@link Scanner}, and parses each line
 as a simple name-value pair.

 <P>Assumes UTF-8 encoding. JDK 7+.
*/
public class ReadWithScanner {

  /**
   Parse a file and log each name-value pair found in it.
   @param aArgs optional; when present, the first item is the name of the file
   to parse. When absent, a default Windows test file is used.
  */
  public static void main(String... aArgs) throws IOException {
    String fileName = (aArgs.length > 0) ? aArgs[0] : DEFAULT_FILE_NAME;
    ReadWithScanner parser = new ReadWithScanner(fileName);
    parser.processLineByLine();
    log("Done.");
  }
  
  /**
   Constructor.
   @param aFileName full name of an existing, readable file.
  */
  public ReadWithScanner(String aFileName){
    fFilePath = Paths.get(aFileName);
  }
  
  
  /**
   Template method that calls {@link #processLine(String)} once for each line
   of the file.
   @throws IOException if the file cannot be opened.
  */
  public final void processLineByLine() throws IOException {
    try (Scanner scanner =  new Scanner(fFilePath, ENCODING.name())){
      while (scanner.hasNextLine()){
        processLine(scanner.nextLine());
      }      
    }
  }
  
  /** 
   Overridable method for processing lines in different ways.
    
   <P>This simple default implementation expects simple name-value pairs, separated by an 
   '=' sign. Examples of valid input: 
   <tt>height = 167cm</tt>
   <tt>mass =  65kg</tt>
   <tt>disposition =  "grumpy"</tt>
   <tt>this is the name = this is the value</tt>
   
   <P>A line which does not supply both a name and a value (an empty line, a line 
   with no '=' sign, or a line with nothing on one side of the '=' sign) is 
   reported as invalid, instead of causing an exception. 
   
   @param aLine a single line of text taken from the file.
  */
  protected void processLine(String aLine){
    //use a second Scanner to parse the content of each line 
    try(Scanner scanner = new Scanner(aLine)){
      scanner.useDelimiter("=");
      if (scanner.hasNext()){
        String name = scanner.next();
        //the value is checked separately: calling next() with no remaining 
        //token would throw a NoSuchElementException
        if (scanner.hasNext()){
          String value = scanner.next();
          log("Name is : " + quote(name.trim()) + ", and Value is : " + quote(value.trim()));
          return;
        }
      }
      log("Empty or invalid line. Unable to process.");
    }
  }
  
  // PRIVATE 
  private final Path fFilePath;
  private final static Charset ENCODING = StandardCharsets.UTF_8;  
  /** The file parsed by <tt>main</tt> when no file name is passed on the command line. */
  private final static String DEFAULT_FILE_NAME = "C:\\Temp\\test.txt";
  private final static String QUOTE = "'";
  
  private static void log(Object aObject){
    System.out.println(Objects.toString(aObject));
  }
  
  /** Return the given text, surrounded by single quotes. */
  private static String quote(String aText){
    return QUOTE + aText + QUOTE;
  }
} 

For a file containing:

height = 167cm
mass =  65kg
disposition =  "grumpy"
this is the name = this is the value
the output of the above class is:
Name is : 'height', and Value is : '167cm'
Name is : 'mass', and Value is : '65kg'
Name is : 'disposition', and Value is : '"grumpy"'
Name is : 'this is the name', and Value is : 'this is the value'
Done.

Example 2

This example uses StringTokenizer. This class is used to parse the text entered into a search box on a web page. It returns a Set of tokens to be used for pattern matching. Here, any text appearing in quotes is treated as a single search token. All other text is split into tokens based simply on whitespace.

An example run:

>java -cp . SearchBoxParser
[mars, venus, milky way, sun]

import java.util.*;

/**
* The user enters text into a search box. This class is used
* to parse that text into specific search terms (or tokens).
* It eliminates common words (ignoring case), and allows for the quoting
* of text, using double quotes.
* JDK 7+.
*/
public final class SearchBoxParser {

  /** Parse a fixed example of search text, and print the resulting tokens. */
  public static void main(String... aArguments) {
    SearchBoxParser parser = new SearchBoxParser("mars venus \"milky way\" sun");
    Set<String> tokens = parser.parseSearchText();
    //display the tokens
    System.out.println(tokens);
  }

  /**
  * @param aSearchText is non-null, but may have no content,
  * and represents what the user has input in a search box.
  * @throws IllegalArgumentException if <tt>aSearchText</tt> is null.
  */
  public SearchBoxParser(String aSearchText) {
    if (aSearchText == null) {
      throw new IllegalArgumentException("Search Text cannot be null.");
    }
    fSearchText = aSearchText;
  }

  /**
  * Parse the user's search box input into a Set of String tokens.
  *
  * @return Set of Strings, one for each word in fSearchText, in the order
  * in which they first appear; here "word" is defined as either a lone word
  * surrounded by whitespace, or as a series of words surrounded by double
  * quotes, "like this"; also, very common words (and, the, etc.), in any
  * mix of upper and lower case, do not qualify as possible search targets.
  */
  public Set<String> parseSearchText() {
    Set<String> result = new LinkedHashSet<>();

    String currentDelims = fWHITESPACE_AND_QUOTES;
    //the delimiters are returned as tokens too, such that the double quote
    //characters can be detected below
    StringTokenizer parser = new StringTokenizer(
      fSearchText, currentDelims, fRETURN_DELIMITERS
    );

    while (parser.hasMoreTokens()) {
      String token = parser.nextToken(currentDelims);
      if (isDoubleQuote(token)) {
        //entering or leaving quoted text: whitespace stops/starts acting
        //as a delimiter
        currentDelims = flipDelimiters(currentDelims);
      }
      else {
        addNonTrivialWordToResult(token, result);
      }
    }
    return result;
  }

  // PRIVATE 
  private final String fSearchText;
  private static final String fDOUBLE_QUOTE = "\"";
  private static final boolean fRETURN_DELIMITERS = true;

  //the parser flips between these two sets of delimiters
  private static final String fWHITESPACE_AND_QUOTES = " \t\r\n\"";
  private static final String fQUOTES_ONLY ="\"";

  /**Very common words to be excluded from searches. All items are lower case.*/
  private static final Set<String> fCOMMON_WORDS = Collections.unmodifiableSet(
    new LinkedHashSet<String>(Arrays.asList(
      "a", "and", "be", "for", "from", "has", "i",
      "in", "is", "it", "of", "on", "to", "the"
    ))
  );

  /**
  * Use to determine if a particular word entered in the
  * search box should be discarded from the search.
  * The comparison ignores case, such that 'The' is treated the same as 'the'.
  */
  private static boolean isCommonWord(String aSearchTokenCandidate){
    //Locale.ROOT keeps the lower-casing independent of the default locale
    return fCOMMON_WORDS.contains(aSearchTokenCandidate.toLowerCase(Locale.ROOT));
  }

  /** Return true only if the text is non-null, and has visible content. */
  private static boolean textHasContent(String aText){
    return (aText != null) && (!aText.trim().equals(""));
  }

  /** Add the trimmed token to the result, unless it is blank or a common word. */
  private static void addNonTrivialWordToResult(String aToken, Set<String> aResult){
    if (!textHasContent(aToken)) {
      return;
    }
    String word = aToken.trim();
    if (!isCommonWord(word)) {
      aResult.add(word);
    }
  }

  private static boolean isDoubleQuote(String aToken){
    return aToken.equals(fDOUBLE_QUOTE);
  }

  /** Return the set of delimiters which is not currently in use. */
  private static String flipDelimiters(String aCurrentDelims){
    return aCurrentDelims.equals(fWHITESPACE_AND_QUOTES) ? fQUOTES_ONLY : fWHITESPACE_AND_QUOTES;
  }
} 

Example 3

This example demonstrates use of regular expressions, by parsing a fully-qualified type name into two parts - the package and the "simple" type name.

import java.util.Objects;
import java.util.regex.*;

/**
 Demonstrates regular expressions, by parsing a fully-qualified type name
 into two parts - the package and the "simple" type name.
*/
public final class RegularExpressions {

  /**
   The pattern is matched to the first argument.
   If no argument is passed, then a usage message is printed, and nothing is matched.
  */
  public static void main (String... aArguments) {
    if (aArguments == null || aArguments.length == 0) {
      System.err.println("Usage: java RegularExpressions <text-to-match>");
      return;
    }
    matchParts(aArguments[0]);
    matchAll(aArguments[0]);
  }

  /**
   The Matcher.find method attempts to match *parts* of the input
   to the given pattern.
  */
  private static void matchParts(String aText){
    log(fNEW_LINE + "Match PARTS:");
    Matcher matcher = fPATTERN.matcher(aText);
    while (matcher.find()) {
      logGroups(matcher);
    }
  }

  /**
   The Matcher.matches method attempts to match the *entire*
   input to the given pattern all at once.
  */
  private static void matchAll(String aText){
    log(fNEW_LINE + "Match ALL:");
    Matcher matcher = fPATTERN.matcher(aText);
    if(matcher.matches()) {
      logGroups(matcher);
    }
    else {
      log("Input does not match pattern.");
    }
  }

  //PRIVATE

  private static final String fNEW_LINE = System.getProperty("line.separator");
  
  private static void log(String aMessage){
    System.out.println(Objects.toString(aMessage));
  }

  /** Log the number of groups, and the package and class found by the current match. */
  private static void logGroups(Matcher aMatcher){
    log("Num groups: " + aMatcher.groupCount());
    log("Package: " + aMatcher.group(1));
    log("Class: " + aMatcher.group(2));
  }

  /**
   A commented regular expression for fully-qualified type names which
   follow the common naming conventions, for example, "com.myappBlah.Thing".
  
   Thus, the "dot + capital letter" is sufficient to define where the
   package names end.
  
   This regular expression uses two groups, one for the package, and one
   for the class. Groups are defined by parentheses. Note that ?: will
   define a group as "non-contributing"; that is, it will not contribute
   to the return values of the <tt>group</tt> method.
   
   As you can see, regular expressions are often cryptic.
  */
  private static final String fREGEXP =
    "#Group1 - Package prefix without last dot: " + fNEW_LINE +
    "( (?:\\w|\\.)+ ) \\." + fNEW_LINE +
    "#Group2 - Class name starts with uppercase: " + fNEW_LINE +
    "( [A-Z](?:\\w)+ )"
  ;

  /**
   The compiled form of the regular expression, built once and shared by all 
   matching methods. Note the necessity of the comments flag, since the 
   regular expression contains comments. 
   
   This field must be declared after the regular expression text from which 
   it is built, since static fields are initialized in textual order.
  */
  private static final Pattern fPATTERN = Pattern.compile(fREGEXP, Pattern.COMMENTS);
} 

Some example runs:

>java -cp . RegularExpressions "java.java.Thing java.lang.Random"

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Num groups: 2
Package: java.lang
Class: Random

Match ALL:
Input does not match pattern.

>java -cp . RegularExpressions "java.java.Thing"

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing

Match ALL:
Num groups: 2
Package: java.java
Class: Thing

>java -cp . RegularExpressions "java.java.Thing "

Match PARTS:
Num groups: 2
Package: java.java
Class: Thing

Match ALL:
Input does not match pattern.

See Also :
Reading and writing text files
Pattern match lines of a file
Compile regular expressions once