If you need data from a site, but the site has no formal web API, then you usually have no choice but to fetch
HTML from the site, parse it, and extract the required data.
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Scanner;

/**
 * Fetch the text of a web page (or its HTTP headers) as a String.
 *
 * <p>The same sort of code can be used to fetch any kind of text:
 * HTML, JSON, XML, plain text, and so on.
 *
 * <p>Failures to reach the server are reported through {@link #log(Object)} rather than
 * thrown; see the individual methods for what they return in that case.
 */
public final class WebPageFetcher {

  /**
   * Demo harness.
   *
   * <ul>
   *   <li>args[0] : an HTTP (or HTTPS) URL
   *   <li>args[1] : (header | content)
   * </ul>
   *
   * <p>When fewer than two arguments are supplied, a usage line is printed and nothing is fetched.
   *
   * @param args the URL followed by the option
   * @throws MalformedURLException if args[0] is not a syntactically valid URL
   */
  public static void main(String... args) throws MalformedURLException {
    if (args == null || args.length < 2) {
      log("Usage: java WebPageFetcher <http-url> (header | content)");
      return;
    }
    String url = args[0];
    String option = args[1];
    WebPageFetcher fetcher = new WebPageFetcher(url);
    if (HEADER.equalsIgnoreCase(option)) {
      log(fetcher.getPageHeader());
    } else if (CONTENT.equalsIgnoreCase(option)) {
      log(fetcher.getPageContent());
    } else {
      log("Unknown option.");
    }
  }

  /**
   * Create a fetcher for the given URL.
   *
   * @param url the page to fetch; its protocol must be {@code http} or {@code https}
   * @throws NullPointerException if {@code url} is null
   * @throws IllegalArgumentException if the URL uses any other protocol
   */
  public WebPageFetcher(URL url) {
    Objects.requireNonNull(url, "url");
    String protocol = url.getProtocol();
    if (!HTTP.equals(protocol) && !HTTPS.equals(protocol)) {
      throw new IllegalArgumentException("URL is not for HTTP Protocol: " + url);
    }
    this.url = url;
  }

  /**
   * Create a fetcher for the URL spelled out in the given text.
   *
   * @param urlName text form of the URL
   * @throws MalformedURLException if {@code urlName} cannot be parsed as a URL
   * @throws IllegalArgumentException if the URL is not an http/https URL
   */
  public WebPageFetcher(String urlName) throws MalformedURLException {
    this(new URL(urlName));
  }

  /**
   * Fetch the content of the URL as simple text.
   *
   * <p>The bytes are decoded with the charset named in the response's Content-Type,
   * falling back to UTF-8 when none is declared or the declared one is unsupported.
   *
   * @return the body text; an empty String if the body is empty; {@code null} if the
   *     connection could not be opened or its stream could not be obtained (the failure is logged)
   */
  public String getPageContent() {
    String result = null;
    try {
      URLConnection connection = url.openConnection();
      try (InputStream in = connection.getInputStream();
          Scanner scanner = new Scanner(in, charsetOf(connection).name())) {
        scanner.useDelimiter(END_OF_INPUT);
        // an empty body has no token at all; next() would throw NoSuchElementException
        result = scanner.hasNext() ? scanner.next() : "";
      }
    } catch (IOException ex) {
      log("Cannot open connection to " + url.toString() + " (" + ex + ")");
    }
    return result;
  }

  /**
   * Fetch all HTTP headers as simple text.
   * One header per line, as a 'key : value' pair.
   *
   * @return the header lines, each terminated by the platform line separator; an empty
   *     String if the connection could not be opened (the failure is logged) or no
   *     headers were received
   */
  public String getPageHeader() {
    StringBuilder result = new StringBuilder();
    URLConnection connection;
    try {
      connection = url.openConnection();
    } catch (IOException ex) {
      log("Cannot open connection to URL: " + url + " (" + ex + ")");
      // without a connection there is nothing to read; do not fall through to a null dereference
      return result.toString();
    }
    try {
      // not all headers come in key-value pairs - sometimes the key is
      // null or an empty String
      int headerIdx = 0;
      String headerValue;
      while ((headerValue = connection.getHeaderField(headerIdx)) != null) {
        String headerKey = connection.getHeaderFieldKey(headerIdx);
        if (headerKey != null && headerKey.length() > 0) {
          result.append(headerKey);
          result.append(" : ");
        }
        result.append(headerValue);
        result.append(NEWLINE);
        headerIdx++;
      }
    } finally {
      // the body is never read here, so release the underlying HTTP connection explicitly
      if (connection instanceof HttpURLConnection) {
        ((HttpURLConnection) connection).disconnect();
      }
    }
    return result.toString();
  }

  // PRIVATE

  private final URL url;

  private static final String HTTP = "http";
  private static final String HTTPS = "https";
  private static final String HEADER = "header";
  private static final String CONTENT = "content";
  private static final String END_OF_INPUT = "\\Z";
  private static final String NEWLINE = System.getProperty("line.separator");
  private static final String CHARSET_PARAM = "charset=";

  /**
   * Work out which charset to decode the response with: the {@code charset=} parameter of
   * the Content-Type if present and supported, otherwise UTF-8.
   */
  private static Charset charsetOf(URLConnection connection) {
    String contentType = connection.getContentType();
    if (contentType != null) {
      for (String part : contentType.split(";")) {
        String param = part.trim();
        if (param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
          String name = param.substring(CHARSET_PARAM.length()).trim();
          if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1);
          }
          try {
            return Charset.forName(name);
          } catch (IllegalArgumentException unsupported) {
            // illegal or unsupported charset name: fall back to the default below
            break;
          }
        }
      }
    }
    return StandardCharsets.UTF_8;
  }

  private static void log(Object thing) {
    System.out.println(Objects.toString(thing));
  }
}
Example run:

>java -cp . WebPageFetcher http://www.date4j.net/ header
HTTP/1.1 200 OK
Date : Tue, 14 Nov 2017 00:24:05 GMT
Accept-Ranges : bytes
ETag : W/"21757-1441143396000"
Last-Modified : Tue, 01 Sep 2015 21:36:36 GMT
Content-Type : text/html; charset=UTF-8
Content-Length : 21757
Vary : Accept-Encoding
Keep-Alive : timeout=3, max=100
Connection : Keep-Alive