老牌的網路連線函式庫。
http://curl.haxx.se/libcurl/c/example.html
看範例,
makefile的寫法
CFLAGS=-g -Wall -O3 curl-config --cflags
-I/usr/include/libxml2
LDLIBS=curl-config --libs
-lxml2 -lpthread
CC=gcc
把網頁的HTML碼,寫到畫面上,
http://curl.haxx.se/libcurl/c/simple.html
make simple 可編譯
#include <stdio.h>
#include <curl/curl.h>
int main(void)
{
CURL *curl;
CURLcode res;
curl = curl_easy_init();
if(curl) {
curl_easy_setopt(curl, CURLOPT_URL, "http://example.com");
/* example.com is redirected, so we tell libcurl to follow redirection */
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if(res != CURLE_OK)
fprintf(stderr, "curl_easy_perform() failed: %s\n",
curl_easy_strerror(res));
/* always cleanup */
curl_easy_cleanup(curl);
}
return 0;
}
這樣就把網頁抓回來了!!
curl 的API命名原則,非常一致。
curl_easy_init();
curl_easy_setopt
curl_easy_cleanup(curl);
包裝的很簡單。
另一個範例,抓網頁,並用libxml2解析TAG。
在下載的源碼中,docs/examples/htmltitle.cpp
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <string>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
//
// Case-insensitive string comparison
//
#ifdef _MSC_VER
#define COMPARE(a, b) (!stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
//
// libxml callback context structure
//
struct Context
{
Context(): addTitle(false) { }
bool addTitle;
std::string title;
};
//
// libcurl variables for error strings and returned data
static char errorBuffer[CURL_ERROR_SIZE];
static std::string buffer;
//
// libcurl write callback function
//
static int writer(char *data, size_t size, size_t nmemb,
std::string *writerData)
{
if (writerData == NULL)
return 0;
writerData->append(data, size*nmemb);
return size * nmemb;
}
//
// libcurl connection initialization
//
static bool init(CURL *&conn, char *url)
{
CURLcode code;
conn = curl_easy_init();
if (conn == NULL)
{
fprintf(stderr, "Failed to create CURL connection\n");
exit(EXIT_FAILURE);
}
code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
if (code != CURLE_OK)
{
fprintf(stderr, "Failed to set error buffer [%d]\n", code);
return false;
}
code = curl_easy_setopt(conn, CURLOPT_URL, url);
if (code != CURLE_OK)
{
fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
return false;
}
code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
if (code != CURLE_OK)
{
fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
return false;
}
code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
if (code != CURLE_OK)
{
fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
return false;
}
code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
if (code != CURLE_OK)
{
fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
return false;
}
return true;
}
//
// libxml start element callback function
//
static void StartElement(void *voidContext,
const xmlChar *name,
const xmlChar **attributes)
{
Context *context = (Context *)voidContext;
if (COMPARE((char *)name, "TITLE"))
{
context->title = "";
context->addTitle = true;
}
(void) attributes;
}
//
// libxml end element callback function
//
static void EndElement(void *voidContext,
const xmlChar *name)
{
Context *context = (Context *)voidContext;
if (COMPARE((char *)name, "TITLE"))
context->addTitle = false;
}
//
// Text handling helper function
//
static void handleCharacters(Context *context,
const xmlChar *chars,
int length)
{
if (context->addTitle)
context->title.append((char *)chars, length);
}
//
// libxml PCDATA callback function
//
static void Characters(void *voidContext,
const xmlChar *chars,
int length)
{
Context *context = (Context *)voidContext;
handleCharacters(context, chars, length);
}
//
// libxml CDATA callback function
//
static void cdata(void *voidContext,
const xmlChar *chars,
int length)
{
Context *context = (Context *)voidContext;
handleCharacters(context, chars, length);
}
//
// libxml SAX callback structure
//
static htmlSAXHandler saxHandler =
{
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
StartElement,
EndElement,
NULL,
Characters,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
cdata,
NULL
};
//
// Parse given (assumed to be) HTML text and return the title
//
static void parseHtml(const std::string &html,
std::string &title)
{
htmlParserCtxtPtr ctxt;
Context context;
ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
XML_CHAR_ENCODING_NONE);
htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
htmlParseChunk(ctxt, "", 0, 1);
htmlFreeParserCtxt(ctxt);
title = context.title;
}
int main(int argc, char *argv[])
{
CURL *conn = NULL;
CURLcode code;
std::string title;
// Ensure one argument is given
if (argc != 2)
{
fprintf(stderr, "Usage: %s <url>\n", argv[0]);
exit(EXIT_FAILURE);
}
curl_global_init(CURL_GLOBAL_DEFAULT);
// Initialize CURL connection
if (!init(conn, argv[1]))
{
fprintf(stderr, "Connection initializion failed\n");
exit(EXIT_FAILURE);
}
// Retrieve content for the URL
code = curl_easy_perform(conn);
curl_easy_cleanup(conn);
if (code != CURLE_OK)
{
fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
exit(EXIT_FAILURE);
}
// Parse the (assumed) HTML code
parseHtml(buffer, title);
// Display the extracted title
printf("Title: %s\n", title.c_str());
return EXIT_SUCCESS;
}
編譯的方法為,
$ g++ -Wall -I/usr/curl/include -I/usr/include/libxml2 htmltitle.cpp \
-o htmltitle
curl-config --libs
-lxml2 -lpthread
除了一些std**::**string title; 感覺幾乎就是C語言。卻要用g++來編譯。
我用上面的makefile來編譯,會找不到
htmltitle.cpp:36:31: fatal error: libxml/HTMLparser.h: 沒有此一檔案或目錄
#include <libxml/HTMLparser.h>
就我的電腦,路徑確實該處,
$ sudo find / -type f -name HTMLparser.h
/usr/include/libxml2/libxml/HTMLparser.h