缘起:以前一直喜欢用scrapy做爬虫,实践效果也很好。后来单位让我写一套分布式爬虫(python实现),用来替代公司原有的爬虫(php实现)。投入使用后,发现效果比原来好:原来20个网站里只能配置10个,现在20个里能配置16个。分析原因,是新的架构设计有一点点扩充性,在大致架构不变的基础上可以进行有限的扩展;而两套爬虫底层的实现原理其实都是CURL。

php的curl,是在php发布程序的ext文件中,作为一个php自带的支持,需要改写php的配置文件,修改php.ini,将;extension=php_curl.dll前的分号去掉。

python的pycurl,不是python自带的模块。用python做爬虫一般都是用urllib、urllib2、twisted等,比较少使用pycurl。安装过程略。

c的curl,是前面2种语言的curl的父程序:先有了c的curl,才有php的curl和python的pycurl。同时,python的pycurl文档说明了它只实现了部分功能,即它是c的curl的一个阉割版。泪奔,原来用了那么长时间的东西,连冰山一角都没触碰到,平时用python的pycurl也只是用到其中的一个或少数几个功能。

如何用:

C的curl:

#include <stdio.h>
#include <curl/curl.h>

int main(void)
{
  CURL *curl;
  CURLcode res;

  curl = curl_easy_init();
if(curl) {
/* First set the URL that is about to receive our POST. This URL can
       just as well be a https:// URL if that is what should receive the
       data. */
    curl_easy_setopt(curl, CURLOPT_URL, "http://postit.example.com/moo.cgi");
/* Now specify the POST data */
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "name=daniel&project=curl");

/* Perform the request, res will get the return code */
    res = curl_easy_perform(curl);

/* always cleanup */
    curl_easy_cleanup(curl);
  }
return 0;
}



php的curl:



<?php

// Fetch a page with the cURL extension and print its body.
$c = curl_init();
curl_setopt($c, CURLOPT_URL, 'http://www.baidu.com');
// Ask curl_exec() to return the response body as a string; without this
// option it prints the body itself and only returns a boolean.
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
$data = curl_exec($c);
if ($data === false) {
    // Read the error text while the handle is still open.
    $data = 'cURL error: ' . curl_error($c);
}
curl_close($c);
// Print the fetched data (the original echoed the handle $c by mistake).
echo $data;

?>



python的pycurl:



import pycurl


def body(buffer):
    # Callback registered through WRITEFUNCTION below; prints the data it is
    # handed. The call form of print works on both Python 2 and Python 3.
    print(buffer)


c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.baidu.com/")
c.setopt(pycurl.WRITEFUNCTION, body)
c.perform()
# Release the handle once the transfer is finished.
c.close()



主要原理:

C:

使用到的数据结构:



/* Public handle type: an opaque void. The functions quoted in this article
   convert a CURL * to struct SessionHandle * before using it. */
typedef void CURL;  /* callers only ever see a void type */
/* Per-session state that sits behind a CURL * handle. */
struct SessionHandle {
struct Names dns;
struct Curl_multi *multi;    /* multi handle pointer; NOTE(review): the
                                original note called this "multi-threaded
                                processing" - it looks like the
                                multi-interface handle, confirm */
struct Curl_one_easy *multi_pos; /* if non-NULL, points to the its position
                                      in multi controlling structure to assist
                                      in removal. */
struct Curl_share *share;    /* Share, handles global variable mutexing */
struct HandleData reqdata;   /* Request-specific data */
struct UserDefined set;      /* values set by the libcurl user; this is what
                                Curl_setopt() writes into */
struct DynamicStatic change; /* possibly modified userdefined data */

struct CookieInfo *cookies;  /* the cookies, read from files and servers */
struct Progress progress;    /* for all the progress meter data */
struct UrlState state;       /* struct for fields used for state info and
                                  other dynamic purposes */
struct PureInfo info;        /* stats, reports and info data */
#if defined(CURL_DOES_CONVERSIONS) && defined(HAVE_ICONV)
  iconv_t outbound_cd;         /* for translating to the network encoding */
  iconv_t inbound_cd;          /* for translating from the network encoding */
  iconv_t utf8_cd;             /* for translating to UTF8 */
#endif /* CURL_DOES_CONVERSIONS && HAVE_ICONV */
  unsigned int magic;          /* set to a CURLEASY_MAGIC_NUMBER */
};

/* User-supplied settings; embedded in struct SessionHandle as its `set`
   member. Only the first few fields are shown in this excerpt. */
struct UserDefined {
  FILE *err;         /* the stderr user data goes here */
void *debugdata;   /* the data that will be passed to fdebug */
char *errorbuffer; /* (Static) store failure messages in here */
long proxyport; /* If non-zero, use this port number by default. If the
                     proxy string features a ":[port]" that one will override
                     this. */
 /* ... the remaining fields are omitted from this excerpt ... */
};



使用的方法1:



1.初始化curl,得到SessionHandle结构体空间
/*
 * curl_easy_init() - create a new session handle.
 *
 * Runs curl_global_init() first when the `initialized` flag is still unset,
 * then asks Curl_open() for a fresh struct SessionHandle.
 *
 * Returns the new handle, or NULL when either step fails.
 */
CURL *curl_easy_init(void)
{
  struct SessionHandle *handle;
  CURLcode rc;

  /* The global (SSL etc.) setup must have happened before a handle exists */
  if(!initialized) {
    rc = curl_global_init(CURL_GLOBAL_DEFAULT);
    if(rc != CURLE_OK) {
      /* global init failed, so there is nothing to hand back */
      DEBUGF(fprintf(stderr, "Error: curl_global_init failed\n"));
      return NULL;
    }
  }

  /* No URL is known yet; the handle is opened without one */
  rc = Curl_open(&handle);
  if(rc == CURLE_OK)
    return handle;

  DEBUGF(fprintf(stderr, "Error: Curl_open failed\n"));
  return NULL;
}



方法2.



设置参数:
/*
 * curl_easy_setopt() - public variadic entry point for setting one option.
 *
 * curl: session handle; NULL is rejected with CURLE_BAD_FUNCTION_ARGUMENT.
 * tag:  the option to set.
 * ...:  the value for that option.
 *
 * Wraps the variadic argument in a va_list and returns whatever
 * Curl_setopt() returns.
 */
CURLcode curl_easy_setopt(CURL *curl, CURLoption tag, ...)
{
  struct SessionHandle *handle = curl;
  CURLcode status;
  va_list ap;

  if(handle == NULL)
    return CURLE_BAD_FUNCTION_ARGUMENT;

  va_start(ap, tag);
  status = Curl_setopt(handle, tag, ap);
  va_end(ap);

  return status;
}
/*
 * Curl_setopt() - store the value of one option in the session handle.
 *
 * data:   session handle whose fields are updated.
 * option: the option being set; selects one case of the switch below.
 * param:  va_list carrying the single value for that option. Each case
 *         fetches it with va_arg() using the type that option expects
 *         (long, char *, curl_off_t or a pointer).
 *
 * Returns CURLE_OK unless a case reports otherwise (the result of
 * Curl_setstropt() / Curl_ch_connc(), or CURLE_OUT_OF_MEMORY from the
 * COPYPOSTFIELDS copy). An option with no matching case yields
 * CURLE_FAILED_INIT.
 */
CURLcode Curl_setopt(struct SessionHandle *data, CURLoption option,
                     va_list param)
{
char *argptr;
  CURLcode result = CURLE_OK;
#ifndef CURL_DISABLE_HTTP
  curl_off_t bigsize;
#endif

switch(option) {
case CURLOPT_DNS_CACHE_TIMEOUT:
    data->set.dns_cache_timeout = va_arg(param, long);
break;
case CURLOPT_DNS_USE_GLOBAL_CACHE:
    {
long use_cache = va_arg(param, long);
if (use_cache)
        Curl_global_host_cache_init();

      data->set.global_dns_cache = (bool)(0 != use_cache);
    }
break;
case CURLOPT_SSL_CIPHER_LIST:
/* set a list of cipher we want to use in the SSL connection */
    result = Curl_setstropt(&data->set.str[STRING_SSL_CIPHER_LIST],
                            va_arg(param, char *));
break;

case CURLOPT_RANDOM_FILE:
/*
     * This is the path name to a file that contains random data to seed
     * the random SSL stuff with. The file is only used for reading.
*/
    result = Curl_setstropt(&data->set.str[STRING_SSL_RANDOM_FILE],
                            va_arg(param, char *));
break;
case CURLOPT_EGDSOCKET:
/*
     * The Entropy Gathering Daemon socket pathname
*/
    result = Curl_setstropt(&data->set.str[STRING_SSL_EGDSOCKET],
                            va_arg(param, char *));
break;
case CURLOPT_MAXCONNECTS:
/*
     * Set the absolute number of maximum simultaneous alive connection that
     * libcurl is allowed to have.
*/
    result = Curl_ch_connc(data, data->state.connc, va_arg(param, long));
break;
case CURLOPT_FORBID_REUSE:
/*
     * When this transfer is done, it must not be left to be reused by a
     * subsequent transfer but shall be closed immediately.
*/
    data->set.reuse_forbid = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FRESH_CONNECT:
/*
     * This transfer shall not use a previously cached connection but
     * should be made with a fresh new connect!
*/
    data->set.reuse_fresh = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_VERBOSE:
/*
     * Verbose means infof() calls that give a lot of information about
     * the connection and transfer procedures as well as internal choices.
*/
    data->set.verbose = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_HEADER:
/*
     * Set to include the header in the general data output stream.
*/
    data->set.include_header = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_NOPROGRESS:
/*
     * Shut off the internal supported progress meter
*/
    data->set.hide_progress = (bool)(0 != va_arg(param, long));
if(data->set.hide_progress)
      data->progress.flags |= PGRS_HIDE;
else
      data->progress.flags &= ~PGRS_HIDE;
break;
case CURLOPT_NOBODY:
/*
     * Do not include the body part in the output data stream.
*/
    data->set.opt_no_body = (bool)(0 != va_arg(param, long));
if(data->set.opt_no_body)
/* in HTTP lingo, this means using the HEAD request */
      data->set.httpreq = HTTPREQ_HEAD;
break;
case CURLOPT_FAILONERROR:
/*
     * Don't output the >=300 error code HTML-page, but instead only
     * return error.
*/
    data->set.http_fail_on_error = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_UPLOAD:
case CURLOPT_PUT:
/*
     * We want to sent data to the remote host. If this is HTTP, that equals
     * using the PUT request.
*/
    data->set.upload = (bool)(0 != va_arg(param, long));
if(data->set.upload)
/* If this is HTTP, PUT is what's needed to "upload" */
      data->set.httpreq = HTTPREQ_PUT;
break;
case CURLOPT_FILETIME:
/*
     * Try to get the file time of the remote document. The time will
     * later (possibly) become available using curl_easy_getinfo().
*/
    data->set.get_filetime = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FTP_CREATE_MISSING_DIRS:
/*
     * An FTP option that modifies an upload to create missing directories on
     * the server.
*/
    data->set.ftp_create_missing_dirs = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FTP_RESPONSE_TIMEOUT:
/*
     * An FTP option that specifies how quickly an FTP response must be
     * obtained before it is considered failure.
*/
    data->set.ftp_response_timeout = va_arg( param , long ) * 1000;
break;
case CURLOPT_DIRLISTONLY:
/*
     * An option that changes the command to one that asks for a list
     * only, no file info details.
*/
    data->set.ftp_list_only = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_APPEND:
/*
     * We want to upload and append to an existing file.
*/
    data->set.ftp_append = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FTP_FILEMETHOD:
/*
     * How do access files over FTP.
*/
    data->set.ftp_filemethod = (curl_ftpfile)va_arg(param, long);
break;
case CURLOPT_NETRC:
/*
     * Parse the $HOME/.netrc file
*/
    data->set.use_netrc = (enum CURL_NETRC_OPTION)va_arg(param, long);
break;
case CURLOPT_NETRC_FILE:
/*
     * Use this file instead of the $HOME/.netrc file
*/
    result = Curl_setstropt(&data->set.str[STRING_NETRC_FILE],
                            va_arg(param, char *));
break;
case CURLOPT_TRANSFERTEXT:
/*
     * This option was previously named 'FTPASCII'. Renamed to work with
     * more protocols than merely FTP.
     *
     * Transfer using ASCII (instead of BINARY).
*/
    data->set.prefer_ascii = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_TIMECONDITION:
/*
     * Set HTTP time condition. This must be one of the defines in the
     * curl/curl.h header file.
*/
    data->set.timecondition = (curl_TimeCond)va_arg(param, long);
break;
case CURLOPT_TIMEVALUE:
/*
     * This is the value to compare with the remote document with the
     * method set with CURLOPT_TIMECONDITION
*/
    data->set.timevalue = (time_t)va_arg(param, long);
break;
case CURLOPT_SSLVERSION:
/*
     * Set explicit SSL version to try to connect with, as some SSL
     * implementations are lame.
*/
    data->set.ssl.version = va_arg(param, long);
break;

#ifndef CURL_DISABLE_HTTP
case CURLOPT_AUTOREFERER:
/*
     * Switch on automatic referer that gets set if curl follows locations.
*/
    data->set.http_auto_referer = (bool)(0 != va_arg(param, long));
break;

case CURLOPT_ENCODING:
/*
     * String to use at the value of Accept-Encoding header.
     *
     * If the encoding is set to "" we use an Accept-Encoding header that
     * encompasses all the encodings we support.
     * If the encoding is set to NULL we don't send an Accept-Encoding header
     * and ignore an received Content-Encoding header.
     *
*/
    argptr = va_arg(param, char *);
    result = Curl_setstropt(&data->set.str[STRING_ENCODING],
                            (argptr && !*argptr)?
                            (char *) ALL_CONTENT_ENCODINGS: argptr);
break;

case CURLOPT_FOLLOWLOCATION:
/*
     * Follow Location: header hints on a HTTP-server.
*/
    data->set.http_follow_location = (bool)(0 != va_arg(param, long));
break;

case CURLOPT_UNRESTRICTED_AUTH:
/*
     * Send authentication (user+password) when following locations, even when
     * hostname changed.
*/
    data->set.http_disable_hostname_check_before_authentication =
      (bool)(0 != va_arg(param, long));
break;

case CURLOPT_MAXREDIRS:
/*
     * The maximum amount of hops you allow curl to follow Location:
     * headers. This should mostly be used to detect never-ending loops.
*/
    data->set.maxredirs = va_arg(param, long);
break;

case CURLOPT_POST301:
/*
     * Obey RFC 2616/10.3.2 and resubmit a POST as a POST after a 301.
*/
    data->set.post301 = (bool)(0 != va_arg(param, long));
break;

case CURLOPT_POST:
/* Does this option serve a purpose anymore? Yes it does, when
       CURLOPT_POSTFIELDS isn't used and the POST data is read off the
       callback! */
if(va_arg(param, long)) {
      data->set.httpreq = HTTPREQ_POST;
      data->set.opt_no_body = FALSE; /* this is implied */
    }
else
      data->set.httpreq = HTTPREQ_GET;
break;

case CURLOPT_COPYPOSTFIELDS:
/*
     * A string with POST data. Makes curl HTTP POST. Even if it is NULL.
     * If needed, CURLOPT_POSTFIELDSIZE must have been set prior to
     *  CURLOPT_COPYPOSTFIELDS and not altered later.
*/
    argptr = va_arg(param, char *);

if (!argptr || data->set.postfieldsize == -1)
      result = Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], argptr);
else {
/*
       *  Check that requested length does not overflow the size_t type.
*/

if ((data->set.postfieldsize < 0) ||
          ((sizeof(curl_off_t) != sizeof(size_t)) &&
           (data->set.postfieldsize > (curl_off_t)((size_t)-1))))
        result = CURLE_OUT_OF_MEMORY;
else {
char * p;

        (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);

/* Allocate even when size == 0. This satisfies the need of possible
           later address compare to detect the COPYPOSTFIELDS mode, and
           to mark that postfields is used rather than read function or
           form data.
*/
        p = malloc((size_t)(data->set.postfieldsize?data->set.postfieldsize:1));

if (!p)
          result = CURLE_OUT_OF_MEMORY;
else {
if (data->set.postfieldsize)
            memcpy(p, argptr, data->set.postfieldsize);

          data->set.str[STRING_COPYPOSTFIELDS] = p;
        }
      }
    }

    data->set.postfields = data->set.str[STRING_COPYPOSTFIELDS];
    data->set.httpreq = HTTPREQ_POST;
break;

case CURLOPT_POSTFIELDS:
/*
     * Like above, but use static data instead of copying it.
*/
    data->set.postfields = va_arg(param, void *);
/* Release old copied data. */
    (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
    data->set.httpreq = HTTPREQ_POST;
break;

case CURLOPT_POSTFIELDSIZE:
/*
     * The size of the POSTFIELD data to prevent libcurl to do strlen() to
     * figure it out. Enables binary posts.
*/
    bigsize = va_arg(param, long);

if (data->set.postfieldsize < bigsize &&
        data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
/* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
      (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
      data->set.postfields = NULL;
      }

    data->set.postfieldsize = bigsize;
break;

case CURLOPT_POSTFIELDSIZE_LARGE:
/*
     * The size of the POSTFIELD data to prevent libcurl to do strlen() to
     * figure it out. Enables binary posts.
*/
    bigsize = va_arg(param, curl_off_t);

if (data->set.postfieldsize < bigsize &&
        data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
/* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
      (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
      data->set.postfields = NULL;
      }

    data->set.postfieldsize = bigsize;
break;

case CURLOPT_HTTPPOST:
/*
     * Set to make us do HTTP POST
*/
    data->set.httppost = va_arg(param, struct curl_httppost *);
    data->set.httpreq = HTTPREQ_POST_FORM;
    data->set.opt_no_body = FALSE; /* this is implied */
break;

case CURLOPT_REFERER:
/*
     * String to set in the HTTP Referer: field.
*/
if(data->change.referer_alloc) {
      free(data->change.referer);
      data->change.referer_alloc = FALSE;
    }
    result = Curl_setstropt(&data->set.str[STRING_SET_REFERER],
                            va_arg(param, char *));
    data->change.referer = data->set.str[STRING_SET_REFERER];
break;
/* ... many more cases are omitted from this excerpt; per the article they
   all likewise store a value into `data` ... */

default:
/* unknown tag and its companion, just ignore: */
    result = CURLE_FAILED_INIT; /* correct this */
break;
  }

return result;
}



3.真正发送请求:



/*
 * curl_easy_perform() - run one easy transfer to completion.
 *
 * The transfer is driven through a temporary multi handle: the easy handle
 * is added to it, curl_multi_perform() is called until nothing is running
 * any more (waiting in Curl_select() for up to one second between rounds),
 * and the multi handle is torn down again.
 *
 * Returns CURLE_BAD_FUNCTION_ARGUMENT for a NULL handle,
 * CURLE_OUT_OF_MEMORY / CURLE_FAILED_INIT when the multi handle cannot be
 * set up, otherwise the result carried by the message read from the multi
 * handle (CURLE_OK when no message is available).
 */
CURLcode curl_easy_perform(CURL *easy)
{
  CURLM *mh;
  CURLMcode mrc;
  CURLMsg *done;
  CURLcode result = CURLE_OK;
  struct timeval wait;
  fd_set rset;
  fd_set wset;
  fd_set eset;
  int running;
  int highest;
  int rc;

  if(easy == NULL)
    return CURLE_BAD_FUNCTION_ARGUMENT;

  mh = curl_multi_init();
  if(mh == NULL)
    return CURLE_OUT_OF_MEMORY;

  mrc = curl_multi_add_handle(mh, easy);
  if(mrc != CURLM_OK) {
    curl_multi_cleanup(mh);
    return (mrc == CURLM_OUT_OF_MEMORY) ? CURLE_OUT_OF_MEMORY
                                        : CURLE_FAILED_INIT;
  }

  /* start the transfer straight away, then alternate perform / wait */
  for(;;) {
    /* call perform again for as long as libcurl asks for it */
    while(curl_multi_perform(mh, &running) == CURLM_CALL_MULTI_PERFORM)
      ;

    if(!running)
      break;

    FD_ZERO(&rset);
    FD_ZERO(&wset);
    FD_ZERO(&eset);

    /* wake up at least once per second */
    wait.tv_sec = 1;
    wait.tv_usec = 0;

    /* Old deprecated style: collect the transfer's file descriptors and
       wait on them. This should probably rather use the
       curl_multi_socket() approach. */
    curl_multi_fdset(mh, &rset, &wset, &eset, &highest);
    rc = Curl_select(highest + 1, &rset, &wset, &eset, &wait);

    if(rc == -1)
      break; /* select error */

    /* timeout or data to send/receive: `running` is still non-zero here,
       so go round again */
  }

  done = curl_multi_info_read(mh, &rc);
  if(done != NULL)
    result = done->data.result;

  /* failures of the two teardown calls are not acted upon */
  mrc = curl_multi_remove_handle(mh, easy);
  mrc = curl_multi_cleanup(mh);

  return result;
}



4.从内存去除申请的空间:



/*
 * curl_easy_cleanup() - close a session handle by passing it to
 * Curl_close(). A NULL handle is accepted and ignored.
 */
void curl_easy_cleanup(CURL *curl)
{
  if(curl != NULL)
    Curl_close((struct SessionHandle *)curl);
}



 

php:

1.使用的数据结构:



/* Per-resource state of PHP's curl extension; wraps one libcurl handle. */
typedef struct {
struct _php_curl_error   err;
struct _php_curl_free    *to_free;
struct _php_curl_send_headers header;
void ***thread_ctx;
    CURL                    *cp; /* the libcurl easy handle. PHP allocates the php_curl wrapper and reaches libcurl through ch->cp, e.g. curl_easy_setopt(ch->cp, ...) */
    php_curl_handlers       *handlers;
long                     id;
    unsigned int             uses;
    zend_bool                in_callback;
    zval                     *clone;
} php_curl;



2. 使用的方法:



/*
 * curl_init([string url]) - create a new cURL resource.
 *
 * Obtains a libcurl easy handle, wraps it in a php_curl structure, installs
 * the extension's default write/read/header callbacks and default options,
 * optionally applies the URL argument, and registers the wrapper as the
 * returned resource.
 *
 * Yields FALSE when the libcurl handle cannot be created or the URL is
 * rejected by php_curl_option_url().
 */
PHP_FUNCTION(curl_init)
{
    php_curl    *ch;
    CURL        *cp;
    zval        *clone;
    char        *url = NULL;
    int        url_len = 0;
    char *cainfo;

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|s", &url, &url_len) == FAILURE) {
        return;
    }

    cp = curl_easy_init();
    if (!cp) {
        php_error_docref(NULL TSRMLS_CC, E_WARNING, "Could not initialize a new cURL handle");
        RETURN_FALSE;
    }

    alloc_curl_handle(&ch);
    TSRMLS_SET_CTX(ch->thread_ctx);

    ch->cp = cp;

    /* default handlers: body to stdout, headers ignored */
    ch->handlers->write->method = PHP_CURL_STDOUT;
    ch->handlers->write->type   = PHP_CURL_ASCII;
    ch->handlers->read->method  = PHP_CURL_DIRECT;
    ch->handlers->write_header->method = PHP_CURL_IGNORE;

    ch->uses = 0;

    MAKE_STD_ZVAL(clone);
    ch->clone = clone;

    /* curl_easy_setopt() is variadic and libcurl fetches integer option
       values with va_arg(param, long), so they are passed as long literals
       rather than plain int. */
    curl_easy_setopt(ch->cp, CURLOPT_NOPROGRESS,        1L);
    curl_easy_setopt(ch->cp, CURLOPT_VERBOSE,           0L);
    curl_easy_setopt(ch->cp, CURLOPT_ERRORBUFFER,       ch->err.str);
    curl_easy_setopt(ch->cp, CURLOPT_WRITEFUNCTION,     curl_write);
    curl_easy_setopt(ch->cp, CURLOPT_FILE,              (void *) ch);
    curl_easy_setopt(ch->cp, CURLOPT_READFUNCTION,      curl_read);
    curl_easy_setopt(ch->cp, CURLOPT_INFILE,            (void *) ch);
    curl_easy_setopt(ch->cp, CURLOPT_HEADERFUNCTION,    curl_write_header);
    curl_easy_setopt(ch->cp, CURLOPT_WRITEHEADER,       (void *) ch);
    curl_easy_setopt(ch->cp, CURLOPT_DNS_USE_GLOBAL_CACHE, 1L);
    curl_easy_setopt(ch->cp, CURLOPT_DNS_CACHE_TIMEOUT, 120L);
    curl_easy_setopt(ch->cp, CURLOPT_MAXREDIRS, 20L); /* prevent infinite redirects */

    /* honour the curl.cainfo ini setting when it is non-empty */
    cainfo = INI_STR("curl.cainfo");
    if (cainfo && strlen(cainfo) > 0) {
        curl_easy_setopt(ch->cp, CURLOPT_CAINFO, cainfo);
    }

#if defined(ZTS)
    curl_easy_setopt(ch->cp, CURLOPT_NOSIGNAL, 1L);
#endif

    if (url) {
        if (!php_curl_option_url(ch, url, url_len)) {
            /* URL rejected: release the half-built handle */
            _php_curl_close_ex(ch TSRMLS_CC);
            RETURN_FALSE;
        }
    }

    ZEND_REGISTER_RESOURCE(return_value, ch, le_curl);
    ch->id = Z_LVAL_P(return_value);
}



执行真实下载 



/*
 * curl_exec(resource ch) - perform the transfer configured on a cURL
 * resource by calling curl_easy_perform() on the wrapped libcurl handle.
 *
 * Yields FALSE when the transfer result is neither CURLE_OK nor
 * CURLE_PARTIAL_FILE. Otherwise, when the write handler method is
 * PHP_CURL_RETURN, yields the buffered data as a string (an empty string if
 * nothing was buffered); for any other write method yields TRUE.
 */
PHP_FUNCTION(curl_exec)
{
    CURLcode    error;
    zval        *zid;
    php_curl    *ch;

if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
return;
    }

    ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);

    _php_curl_verify_handlers(ch, 1 TSRMLS_CC);

    _php_curl_cleanup_handle(ch);

    /* the actual transfer: delegated to libcurl */
    error = curl_easy_perform(ch->cp);
    SAVE_CURL_ERROR(ch, error);
/* CURLE_PARTIAL_FILE is returned by HEAD requests */
if (error != CURLE_OK && error != CURLE_PARTIAL_FILE) {
/* failed transfer: drop whatever was buffered so far */
if (ch->handlers->write->buf.len > 0) {
            smart_str_free(&ch->handlers->write->buf);
        }
        RETURN_FALSE;
    }

/* flush the stream registered as the std_err handler, if it resolves */
if (ch->handlers->std_err) {
        php_stream  *stream;
        stream = (php_stream*)zend_fetch_resource(&ch->handlers->std_err TSRMLS_CC, -1, NULL, NULL, 2, php_file_le_stream(), php_file_le_pstream());
if (stream) {
            php_stream_flush(stream);
        }
    }

/* return mode with buffered data: hand the buffer back as the result */
if (ch->handlers->write->method == PHP_CURL_RETURN && ch->handlers->write->buf.len > 0) {
        smart_str_0(&ch->handlers->write->buf);
        RETURN_STRINGL(ch->handlers->write->buf.c, ch->handlers->write->buf.len, 1);
    }

/* flush the file handle, so any remaining data is synched to disk */
if (ch->handlers->write->method == PHP_CURL_FILE && ch->handlers->write->fp) {
fflush(ch->handlers->write->fp);
    }
if (ch->handlers->write_header->method == PHP_CURL_FILE && ch->handlers->write_header->fp) {
fflush(ch->handlers->write_header->fp);
    }

/* return mode with nothing buffered yields "", every other mode yields TRUE */
if (ch->handlers->write->method == PHP_CURL_RETURN) {
        RETURN_EMPTY_STRING();
    } else {
        RETURN_TRUE;
    }
}



关闭程序,清空内存



/*
 * curl_close(resource ch) - release a cURL resource.
 *
 * Refuses (with a warning) while the handle is inside a callback. Otherwise
 * the `uses` counter is decremented if it is non-zero, and the resource is
 * removed from the resource list once that counter is already zero.
 */
PHP_FUNCTION(curl_close)
{
    php_curl    *handle;
    zval        *resource;

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &resource) == FAILURE) {
        return;
    }

    ZEND_FETCH_RESOURCE(handle, php_curl *, &resource, -1, le_curl_name, le_curl);

    /* closing from within a callback is not allowed */
    if (handle->in_callback) {
        php_error_docref(NULL TSRMLS_CC, E_WARNING, "Attempt to close cURL handle from a callback");
        return;
    }

    if (handle->uses == 0) {
        zend_list_delete(Z_LVAL_P(resource));
    } else {
        handle->uses--;
    }
}



 

python的pycurl

1.使用的数据结构:



/* Python-level Curl object of pycurl; wraps one libcurl easy handle together
   with the Python-side state attached to it. */
typedef struct {
    PyObject_HEAD
    PyObject *dict;                 /* Python attributes dictionary */
    CURL *handle;                   /* the wrapped libcurl (C) easy handle */
    PyThreadState *state;
    CurlMultiObject *multi_stack;
    CurlShareObject *share;
    struct curl_httppost *httppost;
    struct curl_slist *httpheader;
    struct curl_slist *http200aliases;
    struct curl_slist *quote;
    struct curl_slist *postquote;
    struct curl_slist *prequote;
    /* callbacks */
    PyObject *w_cb;
    PyObject *h_cb;
    PyObject *r_cb;
    PyObject *pro_cb;
    PyObject *debug_cb;
    PyObject *ioctl_cb;
    PyObject *opensocket_cb;
    /* file objects */
    PyObject *readdata_fp;
    PyObject *writedata_fp;
    PyObject *writeheader_fp;
    /* misc */
    void *options[OPTIONS_SIZE];    /* for OBJECTPOINT options */
    char error[CURL_ERROR_SIZE+1];
} CurlObject;



方法:

1.初始化对象:



/*
 * do_curl_new() - construct a new pycurl Curl object.
 *
 * Allocates the Python object, creates the libcurl easy handle with
 * curl_easy_init(), and applies pycurl's defaults through
 * curl_easy_setopt(): error buffer, back-reference, NOPROGRESS on, VERBOSE
 * off, FTP_ACCOUNT unset and a "PycURL/<libcurl version>" user agent.
 *
 * Returns the new object, or NULL when util_curl_new() fails. On any later
 * failure the object is released, a Python error is set and NULL is
 * returned.
 */
static CurlObject *
do_curl_new(PyObject *dummy)
{
    CurlObject *self = NULL;
    char *agent = NULL;

    UNUSED(dummy);

    /* Python-side object first */
    self = util_curl_new();
    if (self == NULL)
        return NULL;

    /* then the libcurl handle it wraps */
    self->handle = curl_easy_init();
    if (self->handle == NULL)
        goto error;

    /* register the error buffer with libcurl, then clear it */
    if (curl_easy_setopt(self->handle, CURLOPT_ERRORBUFFER, self->error) != CURLE_OK)
        goto error;
    memset(self->error, 0, sizeof(self->error));

    /* back-reference from the libcurl handle to this object */
    if (curl_easy_setopt(self->handle, CURLOPT_PRIVATE, (char *) self) != CURLE_OK)
        goto error;

    /* no progress output by default */
    if (curl_easy_setopt(self->handle, CURLOPT_NOPROGRESS, (long)1) != CURLE_OK)
        goto error;

    /* no verbose output by default */
    if (curl_easy_setopt(self->handle, CURLOPT_VERBOSE, (long)0) != CURLE_OK)
        goto error;

    /* FTP_ACCOUNT defaults to NULL */
    if (curl_easy_setopt(self->handle, CURLOPT_FTP_ACCOUNT, NULL) != CURLE_OK)
        goto error;

    /* default USERAGENT: "PycURL/" followed by the libcurl version */
    agent = (char *) malloc(7 + strlen(LIBCURL_VERSION) + 1);
    if (agent == NULL)
        goto error;
    strcpy(agent, "PycURL/");
    strcpy(agent + 7, LIBCURL_VERSION);
    /* pass the string to libcurl via curl_easy_setopt(); the handle itself
       was already created by curl_easy_init() above */
    if (curl_easy_setopt(self->handle, CURLOPT_USERAGENT, (char *) agent) != CURLE_OK) {
        free(agent);
        goto error;
    }
    /* remember the buffer in the options table of the object */
    self->options[ OPT_INDEX(CURLOPT_USERAGENT) ] = agent;
    agent = NULL;

    /* Success - return new object */
    return self;

error:
    Py_DECREF(self);    /* this also closes self->handle */
    PyErr_SetString(ErrorObject, "initializing curl failed");
    return NULL;
}



2.设置参数 



/*
 * do_curl_setopt() - implementation of Curl.setopt(option, value) in pycurl.
 *
 * NOTE: this is an excerpt. The return-type line, the `error:` label and the
 * end of the function are not shown; only the string-argument branch is
 * visible here.
 *
 * Visible behaviour: parses (int option, object value), range-checks the
 * option number, and for string values copies the string into a heap buffer,
 * passes that buffer to curl_easy_setopt() and records it in self->options
 * (freeing any buffer previously stored at that index). That branch returns
 * None on success.
 */
do_curl_setopt(CurlObject *self, PyObject *args)
{
int option;
    PyObject *obj;
int res;

if (!PyArg_ParseTuple(args, "iO:setopt", &option, &obj))
return NULL;
if (check_curl_state(self, 1 | 2, "setopt") != 0)
return NULL;

/* early checks of option value */
if (option <= 0)
goto error;
if (option >= (int)CURLOPTTYPE_OFF_T + OPTIONS_SIZE)
goto error;
if (option % 10000 >= OPTIONS_SIZE)
goto error;

#if 0 /* XXX - should we ??? */
/* Handle the case of None */
if (obj == Py_None) {
return util_curl_unsetopt(self, option);
    }
#endif

/* Handle the case of string arguments */
if (PyString_Check(obj)) {
char *str = NULL;
        Py_ssize_t len = -1;
char *buf;
int opt_index;

/* Check that the option specified a string as well as the input */
switch (option) {
case CURLOPT_CAINFO:
/* ... lines omitted from this excerpt (the article describes them as
   libcurl features pycurl does not implement - NOTE(review): in this
   position they look like further string-option case labels, confirm) ... */
case CURLOPT_CRLFILE:
case CURLOPT_ISSUERCERT:
/* FIXME: check if more of these options allow binary data */
            str = PyString_AsString_NoNUL(obj);
if (str == NULL)
return NULL;
break;
case CURLOPT_POSTFIELDS:
if (PyString_AsStringAndSize(obj, &str, &len) != 0)
return NULL;
/* automatically set POSTFIELDSIZE */
if (len <= INT_MAX) {
                res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE, (long)len); /* pycurl sets options by calling libcurl's curl_easy_setopt(), i.e. it is a wrapper around the C library */
            } else {
                res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE_LARGE, (curl_off_t)len);
            }
if (res != CURLE_OK) {
                CURLERROR_RETVAL();
            }
break;
default:
            PyErr_SetString(PyExc_TypeError, "strings are not supported for this option");
return NULL;
        }
/* Allocate memory to hold the string */
        assert(str != NULL);
if (len <= 0)
            buf = strdup(str);
else {
            buf = (char *) malloc(len);
if (buf) memcpy(buf, str, len);
        }
if (buf == NULL)
return PyErr_NoMemory();
/* Call setopt */
        res = curl_easy_setopt(self->handle, (CURLoption)option, buf);
/* Check for errors */
if (res != CURLE_OK) {
            free(buf);
            CURLERROR_RETVAL();
        }
/* Save allocated option buffer */
        opt_index = OPT_INDEX(option);
if (self->options[opt_index] != NULL) {
            free(self->options[opt_index]);
            self->options[opt_index] = NULL;
        }
        self->options[opt_index] = buf;
        Py_INCREF(Py_None);
return Py_None;
    }



3.关闭连接,或者说是删除内存中对象。



/*
 * do_curl_close() - implementation of Curl.close() in pycurl.
 *
 * Returns NULL when check_curl_state() reports a non-zero result for
 * "close"; otherwise passes the object to util_curl_close() and returns
 * None.
 */
static PyObject *
do_curl_close(CurlObject *self)
{
    if (check_curl_state(self, 2, "close") == 0) {
        /* presumably releases what the object holds (the article says the
           CurlObject is deleted here) - confirm against util_curl_close() */
        util_curl_close(self);
        Py_INCREF(Py_None);
        return Py_None;
    }
    return NULL;
}



 

由以上分析可以看出,php的curl和python的pycurl都是对C的curl(libcurl)的一种封装。如果想写出一个更符合自己需求的配置型爬虫,可以考虑直接用C写;不过用C写爬虫不适合快速开发,这是由代码量决定的。

当然更好的建议是使用webkit做爬虫,作为部分浏览器内核,毋庸置疑。以后再说.