2021年7月15日 – mowareのブログ

seleniumの動きがもっさりしているため、C/C++のlibcurlというライブラリで高速化を試みました。スクレイピング先のhtmlをまるごとダウンロードして解析し、テキストを抽出してcsvファイルにまとめるという流れです。

その結果、こちらが引くくらい速くなったので全編C言語で書く必要はなくなり、html解析はPythonライブラリのBeautifulSoupで実施しました。スクレイピング先に負荷をかけないようtime.sleepでループの速度を遅くしています。

C言語の方は参考サイトのコードに少しだけ手を入れて完成させました。

今思えばここまでやらなくてもRequests + BeautifulSoupであればPythonだけで高速化できていたかもしれません。時間があれば検証します。

21/07/17追記：C言語にこだわらなければ、以下のコマンドをsubprocessモジュールで走らせてhtmlをダウンロードするのが最も簡単だと思います。
curl “スクレイピング先のurl” > 出力先

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>

#define EXIT_SUCCESS 0

FILE *fp; // 入力ファイル
FILE *fp2; // 出力ファイル
char fname[] = "horse_url.txt";
char fname2[] = "curl.html";
char buffer[100];
char horse_url[100];

struct Buffer {
    char *data;
    int data_size;
};

size_t buffer_writer(char *ptr, size_t size, size_t nmemb, void *stream) {
    struct Buffer *buf = (struct Buffer *)stream;
    int block = size * nmemb;
    if (!buf) {
        return block;
    }

    if (!buf->data) {
        buf->data = (char *)malloc(block);
    }
    else {
        buf->data = (char *)realloc(buf->data, buf->data_size + block);
    }

    if (buf->data) {
        memcpy(buf->data + buf->data_size, ptr, block);
        buf->data_size += block;
    }

    return block;
}

int main(void) {

    CURL *curl;
    struct Buffer *buf;

    buf = (struct Buffer *)malloc(sizeof(struct Buffer));
    buf->data = NULL;
    buf->data_size = 0;

    fp = fopen(fname, "r");
	while(fgets(buffer,100, fp) != NULL ) {
        sscanf(buffer,"%s",horse_url);
    }
    fclose(fp);

    curl = curl_easy_init();
    curl_easy_setopt(curl, CURLOPT_URL, horse_url);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, buf);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, buffer_writer);

    curl_easy_perform(curl);
    curl_easy_cleanup(curl);

    fp2 = fopen(fname2, "w");
    fprintf(fp2, "%s", buf->data);
    fclose(fp2);

    free(buf->data);
    free(buf);

    return EXIT_SUCCESS;
}

＜C言語コンパイル＆実行とBeautifulSoup処理の箇所のみ＞

proc = subprocess.run("gcc [ソースコード] -I/usr/local/opt/curl/include -lcurl ; ./a.out ; ECHO 'C言語実行完了'", shell=True, stdout= subprocess.PIPE, stderr = subprocess.PIPE)
print(proc.stdout.decode('UTF-8'))

with open(html,encoding='EUC-JP',errors='ignore') as f:
    contents= f.read()
soup = BeautifulSoup(contents, "html.parser")

参考サイト

日	月	火	水	木	金	土
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31

日: 2021年7月15日

[Python] 284 C言語併用によるスクレイピング高速化 libcurl