This example uses Scanner
. Here, the contents of a file containing name-value
pairs is read, and each line is parsed into its constituent data.
import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Objects; import java.util.Scanner; /** Assumes UTF-8 encoding. JDK 7+. */ public final class ReadWithScanner { public static void main(String... args) throws IOException { ReadWithScanner parser = new ReadWithScanner("C:\\Temp\\test.txt"); parser.processLineByLine(); log("Done."); } /** Constructor. @param fileName full name of an existing, readable file. */ public ReadWithScanner(String fileName){ filePath = Paths.get(fileName); } /** Template method that calls {@link #processLine(String)}. */ public final void processLineByLine() throws IOException { try (Scanner scanner = new Scanner(filePath, ENCODING.name())){ while (scanner.hasNextLine()){ processLine(scanner.nextLine()); } } } /** Overridable method for processing lines in different ways. <P>This simple default implementation expects simple name-value pairs, separated by an '=' sign. Examples of valid input: <tt>height = 167cm</tt> <tt>mass = 65kg</tt> <tt>disposition = "grumpy"</tt> <tt>this is the name = this is the value</tt> */ protected void processLine(String line){ //use a second Scanner to parse the content of each line try(Scanner scanner = new Scanner(line)){ scanner.useDelimiter("="); if (scanner.hasNext()){ //assumes the line has a certain structure String name = scanner.next(); String value = scanner.next(); log("Name is : " + quote(name.trim()) + ", and Value is : " + quote(value.trim())); } else { log("Empty or invalid line. Unable to process."); } } } // PRIVATE private final Path filePath; private final static Charset ENCODING = StandardCharsets.UTF_8; private static void log(Object object){ System.out.println(Objects.toString(object)); } private String quote(String text){ String QUOTE = "'"; return QUOTE + text + QUOTE; } }
height = 167cm mass = 65kg disposition = "grumpy" this is the name = this is the valuethe output of the above class is:
Name is : 'height', and Value is : '167cm' Name is : 'mass', and Value is : '65kg' Name is : 'disposition', and Value is : '"grumpy"' Name is : 'this is the name', and Value is : 'this is the value' Done.
Example 2
This example uses StringTokenizer
.
This class is used to parse the text entered into a search box on a web page. It
returns a Set
of tokens to be used for pattern
matching. Here, any text appearing in quotes is treated as a single search token. All
other text is split into tokens based simply on whitespace.
An example run:
>java -cp . SearchBoxParser
[mars, sun, milky way, venus]
import java.util.*; /** The user enters text into a search box. This class is used to parse that text into specific search terms (or tokens). It eliminates common words, and allows for the quoting of text, using double quotes. JDK 7+. */ public final class SearchBoxParser { public static void main(String... arguments) { SearchBoxParser parser = new SearchBoxParser("mars venus \"milky way\" sun"); Set<String> tokens = parser.parseSearchText(); //display the tokens System.out.println(tokens); } /** @param searchText is non-null, but may have no content, and represents what the user has input in a search box. */ public SearchBoxParser(String searchText) { if (searchText == null) { throw new IllegalArgumentException("Search Text cannot be null."); } this.searchText = searchText; } /** Parse the user's search box input into a Set of String tokens. @return Set of Strings, one for each word in fSearchText; here "word" is defined as either a lone word surrounded by whitespace, or as a series of words surrounded by double quotes, "like this"; also, very common words (and, the, etc.) do not qualify as possible search targets. */ public Set<String> parseSearchText() { Set<String> result = new LinkedHashSet<>(); boolean returnTokens = true; String currentDelims = WHITESPACE_AND_QUOTES; StringTokenizer parser = new StringTokenizer( searchText, currentDelims, returnTokens ); String token = null; while (parser.hasMoreTokens()) { token = parser.nextToken(currentDelims); if (!isDoubleQuote(token)){ addNonTrivialWordToResult(token, result); } else { currentDelims = flipDelimiters(currentDelims); } } return result; } // PRIVATE private String searchText; private static final Set<String> COMMON_WORDS = new LinkedHashSet<>(); private static final String DOUBLE_QUOTE = "\""; //the parser flips between these two sets of delimiters private static final String WHITESPACE_AND_QUOTES = " \t\r\n\""; private static final String QUOTES_ONLY ="\""; /**Very common words to be excluded from searches.*/ static { COMMON_WORDS.add("a"); COMMON_WORDS.add("and"); COMMON_WORDS.add("be"); COMMON_WORDS.add("for"); COMMON_WORDS.add("from"); COMMON_WORDS.add("has"); COMMON_WORDS.add("i"); COMMON_WORDS.add("in"); COMMON_WORDS.add("is"); COMMON_WORDS.add("it"); COMMON_WORDS.add("of"); COMMON_WORDS.add("on"); COMMON_WORDS.add("to"); COMMON_WORDS.add("the"); } /** * Use to determine if a particular word entered in the * search box should be discarded from the search. */ private boolean isCommonWord(String searchTokenCandidate){ return COMMON_WORDS.contains(searchTokenCandidate); } private boolean textHasContent(String text){ return (text != null) && (!text.trim().equals("")); } private void addNonTrivialWordToResult(String token, Set<String> result){ if (textHasContent(token) && !isCommonWord(token.trim())) { result.add(token.trim()); } } private boolean isDoubleQuote(String token){ return token.equals(DOUBLE_QUOTE); } private String flipDelimiters(String currentDelims){ String result = null; if (currentDelims.equals(WHITESPACE_AND_QUOTES)){ result = QUOTES_ONLY; } else { result = WHITESPACE_AND_QUOTES; } return result; } }
This example demonstrates use of regular expressions, by parsing a fully-qualified
type name into two parts - the package and the "simple" type name.
import java.util.Objects; import java.util.regex.*; public final class RegularExpressions { /** The pattern is matched to the first argument. */ public static void main (String... args) { matchParts(args[0]); matchAll(args[0]); } /** The Matcher.find method attempts to match *parts* of the input to the given pattern. */ private static void matchParts(String text){ log(NEW_LINE + "Match PARTS:"); //note the necessity of the comments flag, since our regular //expression contains comments: Pattern pattern = Pattern.compile(REGEXP, Pattern.COMMENTS); Matcher matcher = pattern.matcher(text); while (matcher.find()) { log("Num groups: " + matcher.groupCount()); log("Package: " + matcher.group(1)); log("Class: " + matcher.group(2)); } } /** The Matcher.matches method attempts to match the *entire* input to the given pattern all at once. */ private static void matchAll(String text){ log(NEW_LINE + "Match ALL:"); Pattern pattern = Pattern.compile(REGEXP, Pattern.COMMENTS); Matcher matcher = pattern.matcher(text); if(matcher.matches()) { log("Num groups: " + matcher.groupCount()); log("Package: " + matcher.group(1)); log("Class: " + matcher.group(2)); } else { log("Input does not match pattern."); } } //PRIVATE private static final String NEW_LINE = System.getProperty("line.separator"); private static void log(String msg){ System.out.println(Objects.toString(msg)); } /** A commented regular expression for fully-qualified type names which follow the common naming conventions, for example, "com.myappBlah.Thing". Thus, the "dot + capital letter" is sufficient to define where the package names end. This regular expression uses two groups, one for the package, and one for the class. Groups are defined by parentheses. Note that ?: will define a group as "non-contributing"; that is, it will not contribute to the return values of the <tt>group</tt> method. As you can see, regular expressions are often cryptic. */ private static final String REGEXP = "#Group1 - Package prefix without last dot: " + NEW_LINE + "( (?:\\w|\\.)+ ) \\." + NEW_LINE + "#Group2 - Class name starts with uppercase: " + NEW_LINE + "( [A-Z](?:\\w)+ )" ; }
>java -cp . RegularExpressions "java.java.Thing java.lang.Random"
Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Num groups: 2
Package: java.lang
Class: Random
Match ALL:
Input does not match pattern.
>java -cp . RegularExpressions "java.java.Thing"
Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Match ALL:
Num groups: 2
Package: java.java
Class: Thing
>java -cp . RegularExpressions "java.java.Thing "
Match PARTS:
Num groups: 2
Package: java.java
Class: Thing
Match ALL:
Input does not match pattern.