[M1 Mac, MacOS Ventura 13.3.1, clang 14.0.3]
tidy-html5やlibxml2といった有名どころのHTMLパーサーを使おうとしたものの、上手く使いこなせません。
そこで、文字列加工関数と正規表現を使って、スレッドIDとスレッドタイトルの組をstd::pairのvector(std::vector<std::pair<std::string, std::string>>)にまとめました。
#include <cerrno>
#include <cstring>
#include <iostream>
#include <regex>
#include <string>
#include <utility>
#include <vector>

#include <curl/curl.h>
#include <iconv.h>
// Write callback handed to libcurl: appends one received chunk to *buffer.
// `contents` points at `size * nmemb` bytes of data; the return value is the
// number of bytes appended (always the full chunk).
size_t WriteCallback(void* contents, size_t size, size_t nmemb, std::string* buffer) {
    const size_t chunkBytes = size * nmemb;
    const char* chunkBegin = static_cast<const char*>(contents);
    buffer->append(chunkBegin, chunkBegin + chunkBytes);
    return chunkBytes;
}
// Converts `input` from CP932 (Microsoft's Shift-JIS variant) to UTF-8 using iconv.
//
// Returns the converted text. If the converter cannot be opened or the
// conversion fails (e.g. an invalid byte sequence), a message is written to
// std::cerr and an empty string is returned.
std::string ConvertShiftJISToUTF8(const std::string& input) {
    const iconv_t cd = iconv_open("UTF-8", "CP932");
    // iconv_open reports failure with the sentinel (iconv_t)-1; the C-style
    // cast is kept because iconv_t is an opaque, platform-defined type.
    if (cd == (iconv_t)-1) {
        std::cerr << "Error: Failed to open iconv" << std::endl;
        return std::string();
    }

    // Closes the descriptor on every exit path, including exceptions thrown
    // by the string allocations below.
    struct IconvCloser {
        iconv_t handle;
        ~IconvCloser() { iconv_close(handle); }
    } closer{cd};

    size_t inBytesLeft = input.size();
    // Reserve four output bytes per input byte as a generous upper bound for
    // the converted size; the std::string owns the buffer, so no manual delete.
    std::string output(inBytesLeft * 4, '\0');
    size_t outBytesLeft = output.size();

    // iconv takes a non-const char** but only reads from the input buffer.
    char* inPtr = const_cast<char*>(input.data());
    char* outPtr = &output[0];

    if (iconv(cd, &inPtr, &inBytesLeft, &outPtr, &outBytesLeft) == static_cast<size_t>(-1)) {
        // Capture errno right away: stream output may overwrite it.
        const int savedErrno = errno;
        std::cerr << "Error: Failed to convert encoding: " << std::strerror(savedErrno) << std::endl;
        return std::string();
    }

    // Trim the unused tail of the over-sized buffer.
    output.resize(output.size() - outBytesLeft);
    return output;
}
int main() {
// URLからHTMLファイルを取り込む
std::string url = "HTMLファイルのURL";
std::string htmlBuffer;
CURL* curl = curl_easy_init();
if (curl) {
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &htmlBuffer);
CURLcode res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
if (res != CURLE_OK) {
std::cerr << "Error: Failed to download HTML" << std::endl;
return 1;
}
} else {
std::cerr << "Error: Failed to initialize CURL" << std::endl;
return 1;
}
// 文字コードをCP932(Microsoftの拡張Shift-JIS)からUTF-8へ変換
htmlBuffer = ConvertShiftJISToUTF8(htmlBuffer);
// htmlBufferの確認
// cout << "htmlBuffer: \n" << htmlBuffer << endl;
// smallタグ部分を抽出
std::string delimiter = "<small id=\"trad\">";
htmlBuffer = htmlBuffer.substr(htmlBuffer.find(delimiter) + delimiter.length());
delimiter = "</small>";
htmlBuffer = htmlBuffer.substr(0, htmlBuffer.find(delimiter));
// std::cout << "抽出内容:\n" << htmlBuffer << std::endl;
// 正規表現を使ってaタグ内のhrefとaタグの内容を抽出 (.*?)部分
std::regex pattern("<a.*?href=\"(.*?)/l50\".*?>(.*?)</a>");
std::smatch matches;
std::string::const_iterator searchStart(htmlBuffer.cbegin());
std::vector<std::pair<std::string, std::string>> idTitlePairs;
while (std::regex_search(searchStart, htmlBuffer.cend(), matches, pattern)) {
std::string id = matches[1];
std::string title = matches[2];
idTitlePairs.push_back(std::make_pair(id, title));
searchStart = matches.suffix().first;
}
for (const auto& pair : idTitlePairs) {
std::cout << "id: " << pair.first << std::endl;
std::cout << "title: " << pair.second << std::endl;
}
return 0;
}