Salut,
Même si la structure des pages de résultats de Google a changé depuis, voici un exemple d'extraction des résultats Google avec HTMLParser (toute la logique d'extraction se trouve dans la méthode print()) :
package parsers;
import java.util.regex.Matcher; import java.util.regex.Pattern;
import org.apache.commons.httpclient.NameValuePair; import org.htmlparser.Attribute; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.Tag; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.Div; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException;
import commons.HttpConstants; import commons.HttpConstants.MethodType; import commons.HttpRequest; import commons.SearchResults;
public class GoogleResultsParser extends HttpRequest {
private static final String PROTOCOL = HttpConstants.DEFAULT_PROTOCOL; private static final String USER = HttpConstants.DEFAULT_USER; private static final String PASS = HttpConstants.DEFAULT_PASS; private static final String HOST = "www.google.com"; private static final int PORT = HttpConstants.DEFAULT_PORT; private static final String PATH = HttpConstants.DEFAULT_PATH; private static final String FILE = "search"; private static final String QUERY = "q=httpclient&start="; private static final String REFERENCE = HttpConstants.DEFAULT_REFERENCE; private static final NameValuePair[] PARAMETERS = HttpConstants.DEFAULT_PARAMETERS;
private static final Pattern googleTitle = Pattern.compile("<a href=\"([^\"]*)\" class=l>([^<]*)</a>"); private static final Pattern googleText = Pattern.compile("<td class=j><font size=-1>([^<]*)<br>");
private NodeFilter filter; private NodeList list; private SearchResults results; public GoogleResultsParser(String protocol, String user, String pass, String host, int port, String path, String file, String query, String reference, NameValuePair[] parameters) { super(MethodType.GET, protocol, user, pass, host, port, path, file, query, reference, parameters); filter = new NodeClassFilter (Div.class); list = new NodeList(); results = new SearchResults(); } public void parse(String source) { try { list.add(new Parser (source).extractAllNodesThatMatch (filter)); } catch (ParserException e) { e.printStackTrace (); } }
public int print() { int nbLinks = 0; Attribute attribute; for (int i = 0; i < list.size(); i++){ if ((attribute = ((Tag) list.elementAt(i)).getAttributeEx("class")) != null && attribute.getValue().equals("g")) { Matcher mGoogleTitle = googleTitle.matcher(list.elementAt(i).getFirstChild().toHtml(). replaceAll("<b>", "").replaceAll("</b>", "")); Matcher mGoogleText = googleText.matcher(list.elementAt(i).getLastChild().getFirstChild(). getFirstChild().toHtml().replaceAll("<b>", "").replaceAll("</b>", "")); if (mGoogleTitle.find()) { results.add(mGoogleTitle.group(1), results.new Result(mGoogleTitle.group(2), mGoogleText.find() ? mGoogleText.group(1) : "")); nbLinks++; } } } results.print(); return nbLinks; } public static void main(String[] args) { // read content of url using a GoogleResultsParser GoogleResultsParser client = new GoogleResultsParser(PROTOCOL, USER, PASS, HOST, PORT, PATH, FILE, QUERY, REFERENCE, PARAMETERS); for ( int i = 0; i < 5; i++ ) { client.buildURL(PROTOCOL, USER, PASS, HOST, PORT, PATH, FILE, QUERY+(i*10), REFERENCE); String response = client.read(); if ( response.length() != 0 ) client.parse(response); } System.out.println("\nnbResponses = " + client.print()); } }
|