Fetch web page and header
Here's an example of programmatically fetching the HTML content of a web page as simple text.
This could be used, for example, to fetch stock prices or the weather forecast from the web - the raw HTML is first fetched, then the desired content is extracted and presented in some customized manner.
Of course, if the web site publishes a true API, and serves structured data using
JSON, XML, or similar, then that should be used instead of fetching hypertext.
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.util.Scanner;

/**
 * Fetches the content of a web page (or its HTTP response header) as a String.
 *
 * <p>Neither fetch method throws on an I/O failure: the problem is logged to standard output
 * and a fallback value is returned instead (see the individual methods).
 */
public final class WebPageFetcher {

  /**
   * Demo harness.
   *
   * <ul>
   *   <li>aArgs[0] : an HTTP or HTTPS URL
   *   <li>aArgs[1] : (header | content)
   * </ul>
   *
   * <p>Prints a usage line and returns when fewer than two arguments are supplied.
   *
   * @throws MalformedURLException if aArgs[0] cannot be parsed as a URL
   */
  public static void main(String... aArgs) throws MalformedURLException {
    if (aArgs == null || aArgs.length < 2) {
      log("Usage: java WebPageFetcher <url> (header | content)");
      return;
    }
    String url = aArgs[0];
    String option = aArgs[1];
    WebPageFetcher fetcher = new WebPageFetcher(url);
    if (HEADER.equalsIgnoreCase(option)) {
      log(fetcher.getPageHeader());
    } else if (CONTENT.equalsIgnoreCase(option)) {
      log(fetcher.getPageContent());
    } else {
      log("Unknown option.");
    }
  }

  /**
   * Creates a fetcher for the given URL.
   *
   * @param aURL the page to fetch; its protocol must be {@code http} or {@code https}
   * @throws IllegalArgumentException if the URL uses any other protocol
   */
  public WebPageFetcher(URL aURL) {
    String protocol = aURL.getProtocol();
    if (!HTTP.equals(protocol) && !HTTPS.equals(protocol)) {
      throw new IllegalArgumentException("URL is not for HTTP Protocol: " + aURL);
    }
    fURL = aURL;
  }

  /**
   * Creates a fetcher for the URL spelled out in the given text.
   *
   * @param aUrlName textual form of an {@code http} or {@code https} URL
   * @throws MalformedURLException if the text cannot be parsed as a URL
   * @throws IllegalArgumentException if the URL uses any other protocol
   */
  public WebPageFetcher(String aUrlName) throws MalformedURLException {
    this(new URL(aUrlName));
  }

  /**
   * Fetch the HTML content of the page as simple text.
   *
   * <p>The body is decoded with the charset named in the response's {@code Content-Type}
   * header when one is present and supported, otherwise with the platform default charset.
   *
   * @return the page body; an empty String when the body is empty; {@code null} when the
   *     connection could not be opened or read (the failure is logged)
   */
  public String getPageContent() {
    String result = null;
    try {
      URLConnection connection = fURL.openConnection();
      // try-with-resources closes the scanner and the underlying stream on every path
      try (InputStream in = connection.getInputStream();
          Scanner scanner = new Scanner(in, charsetOf(connection).name())) {
        // with this delimiter a single token spans the entire input
        scanner.useDelimiter(END_OF_INPUT);
        // an empty body yields no token at all; next() would throw NoSuchElementException
        result = scanner.hasNext() ? scanner.next() : "";
      }
    } catch (IOException ex) {
      log("Cannot open connection to " + fURL.toString() + " (" + ex + ")");
    }
    return result;
  }

  /**
   * Fetch HTTP headers as simple text, one header per line.
   *
   * @return the header fields in index order, each followed by the platform line separator;
   *     an empty String when the connection could not be opened (the failure is logged)
   */
  public String getPageHeader() {
    StringBuilder result = new StringBuilder();
    URLConnection connection;
    try {
      connection = fURL.openConnection();
    } catch (IOException ex) {
      log("Cannot open connection to URL: " + fURL + " (" + ex + ")");
      // without a connection there is nothing to read; bail out instead of dereferencing null
      return result.toString();
    }

    // not all headers come in key-value pairs - sometimes the key is
    // null or an empty String
    int headerIdx = 0;
    String headerValue;
    while ((headerValue = connection.getHeaderField(headerIdx)) != null) {
      String headerKey = connection.getHeaderFieldKey(headerIdx);
      if (headerKey != null && headerKey.length() > 0) {
        result.append(headerKey);
        result.append(" : ");
      }
      result.append(headerValue);
      result.append(NEWLINE);
      headerIdx++;
    }
    return result.toString();
  }

  // PRIVATE

  private final URL fURL;

  private static final String HTTP = "http";
  private static final String HTTPS = "https";
  private static final String HEADER = "header";
  private static final String CONTENT = "content";
  private static final String END_OF_INPUT = "\\Z";
  private static final String CHARSET_PARAM = "charset=";
  private static final String NEWLINE = System.getProperty("line.separator");

  /**
   * Picks the charset used to decode the response body: the {@code charset} parameter of the
   * Content-Type header when it is present and supported, else the platform default.
   */
  private static Charset charsetOf(URLConnection aConnection) {
    String contentType = aConnection.getContentType();
    if (contentType != null) {
      for (String part : contentType.split(";")) {
        String param = part.trim();
        if (param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
          String name = param.substring(CHARSET_PARAM.length()).trim();
          // the parameter value may be written as a quoted string
          if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1);
          }
          try {
            return Charset.forName(name);
          } catch (IllegalArgumentException ignored) {
            // illegal or unsupported charset name: fall through to the platform default
            break;
          }
        }
      }
    }
    return Charset.defaultCharset();
  }

  private static void log(Object aObject) {
    System.out.println(aObject);
  }
}
An example run, fetching an HTTP header from google.com:
>java -cp . WebPageFetcher http://www.google.com/ header
HTTP/1.1 200 OK
Cache-Control : private
Content-Type : text/html; charset=ISO-8859-1
Server : GWS/2.1
Transfer-Encoding : chunked
Date : Wed, 29 Aug 2007 13:21:40 GMT
Would you use this technique?