serious-code.net  네이버사전데이터긁어오기 UserPreferences
 
Private HelpContents Search Diffs Info Edit Subscribe XML Print View
ȸ»ç ÀÏ ¶§¹®¿¡ ³×À̹ö¿¡¼­ »çÀü µ¥ÀÌÅ͸¦ ±Ü¾î¿À°Ô µÇ¾ú´Ù. HTML Çü½Äµµ ´ë°­ ´Ù ºñ½ÁÇϰí ÇØ¼­ ÇϳªÀÇ ÇÁ·Î±×·¥À¸·Î ÇÑ¿µ,¿µÇÑ,ÇÑÀÏ,ÀÏÇÑ,ÇÑÇÑ »çÀü µîÀ» ¸ðµÎ ±Ü¾î¿Ã ¼ö ÀÖ¾ú´Ù. ¾Æ·¡´Â ±× ÇÁ·Î±×·¥ÀÇ ¼Ò½º´Ù. ´ëºÎºÐÀÇ ¼Ò½º´Â RetrievingFileUsingHttp ÆäÀÌÁö¿¡ ÀÖ´Â ¼Ò½º¿Í ¶È°°Àº ¼Ò½º´Ù. ´Ù¸¸ Àü¼ÛµÈ HTML¿¡¼­ ÀÌ·±Àú·± ű׸¦ ¾ø¾ÖÁÖ´Â ºÎºÐÀÌ ³ªÁß¿¡ ¶Ç ©·Á¸é ±ÍÂúÀ» °Í °°¾Æ¼­ ÀÌ·¸°Ô ³²°ÜµÐ´Ù.



#include "MTypes.h"
#include <conio.h>
#include <stdio.h>
#include <fcntl.h>
#include <io.h>
#include <winsock.h>
#include <fstream>
#include "MUtil.h"

void GetHTTP(LPCSTR lpServerName, LPCSTR lpFileName, string& text);

// Helper macro for displaying errors.
// Writes "<s>: <code>" to stderr, where <s> is a C string naming the call
// that failed and <code> is whatever WSAGetLastError() returns at the point
// of expansion. The string must be formatted with %s; the previous "%:" was
// not a valid conversion, so the name was never printed.
#define PRINTERROR(s)   \
        fprintf(stderr,"\n%s: %d\n", s, WSAGetLastError())

void main(int argc, char **argv)
{
    WORD wVersionRequested = MAKEWORD(1,1);
    WSADATA wsaData;
    int nRet;

    if (argc != 3)
    {
        cerr << "Usage: " << argv[0] << " START_INDEX END_INDEX" << endl;
        return;
    }

    int start_index = atoi(argv[1]);
    int end_index = atoi(argv[2]);

    if (start_index > end_index)
    {
        cout << "END_INDEX must be larger than START_INDEX!" << endl;
        return;
    }

    nRet = WSAStartup(wVersionRequested, &wsaData);
    if (nRet)
    {
        fprintf(stderr,"\nWSAStartup(): %d\n", nRet);
        WSACleanup();
        return;
    }
    
    if (wsaData.wVersion != wVersionRequested)
    {
        fprintf(stderr,"\nWinSock version not supported\n");
        WSACleanup();
        return;
    }

    char buf[1024] = {0, };

    string text;
    string title;

    sprintf(buf, "download_%d_%d.txt", start_index, end_index);

    ofstream file(buf, ios::out | ios::trunc);

    //string token1 = "<a class=hb";
    //string token2 = "</small>";

    string token1 = "<!-- content -->";
    string token2 = "<!-- end of content -->";

    size_t begin = 0;
    size_t end = 0;

    text.reserve(1024*100);

    for (int i=start_index; i<=end_index; i++)
    {
        memset(buf, 0, 1024);
        sprintf(buf, "/endic.php?docid=%d", i);

        //http://endic.naver.com/endic.php?docid=135905

        GetHTTP("endic.naver.com", buf, text);

        begin = text.find("<title>", 0);
        end = text.find("</title>", begin);

        if (begin < end && end != string::npos)
        {
            title = text.substr(begin + 7 + 25, end - begin - 7 - 25 - 1);
        }

        begin = text.find(token1, 0);
        end   = text.find(token2, begin);

        if (begin < end && end != string::npos)
        {
            file << i << "  ==================================================" 
                << endl << title << " | ";

            text = text.substr(begin, end - begin + token2.size());

            //file << text.substr(begin, end - begin + 8) << endl;

            size_t i = 0;
            size_t j = 0;
            size_t k = 0;

            //              k
            // i            j  i  j
            // 0123456789012345678901234567890123456789
            // <a href="...">hm</a><a href="...">...</a>

            while (i < text.size() && j < text.size())
            {
                i = text.find_first_of('<', k);
                if (i == string::npos) break;

                j = text.find_first_of('>', i);
                if (j == string::npos) break;

                if (k < i)
                {
                    string subtext = text.substr(k, i-k);
                    if (!subtext.empty()) file << subtext;
                }

                k = j + 1;
            }

            file << endl;
        }

        cout << i << endl;
    }

    WSACleanup();
}

// Fetches a document over a plain TCP connection and stores the raw response
// in `text`.
//
// lpServerName  host name or dotted-decimal IPv4 address of the server
// lpFileName    request path, sent verbatim as "GET <path>\n"
// text          receives everything the server sends until it closes the
//               connection; it is emptied first, so it is empty (or holds a
//               partial response) when an error occurs
//
// Errors are reported on stderr through PRINTERROR; nothing is returned or
// thrown. WSAStartup() must already have been called by the caller.
void GetHTTP(LPCSTR lpServerName, LPCSTR lpFileName, string& text)
{
    // Clear the output up front so that every early return leaves the caller
    // with an empty result rather than the previous page.
    text = "";

    SOCKADDR_IN saServer;
    memset(&saServer, 0, sizeof(saServer));
    saServer.sin_family = AF_INET;

    saServer.sin_addr.s_addr = inet_addr(lpServerName);
    if (saServer.sin_addr.s_addr == INADDR_NONE)
    {
        // Wasn't an IP address string, assume it is a name
        LPHOSTENT lpHostEntry = gethostbyname(lpServerName);
        if (lpHostEntry == NULL)
        {
            PRINTERROR("gethostbyname()");
            return;
        }
        saServer.sin_addr = *((LPIN_ADDR)*lpHostEntry->h_addr_list);
    }
    // else: it was a valid IP address string and is used as-is; a reverse
    // lookup is unnecessary and would fail for hosts without a PTR record.

    SOCKET  Socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (Socket == INVALID_SOCKET)
    {
        PRINTERROR("socket()");
        return;
    }

    // Use the port registered for "http", falling back to 80.
    LPSERVENT lpServEnt = getservbyname("http", "tcp");
    if (lpServEnt == NULL)
        saServer.sin_port = htons(80);
    else
        saServer.sin_port = lpServEnt->s_port;

    int nRet = connect(Socket, (LPSOCKADDR)&saServer, sizeof(SOCKADDR_IN));
    if (nRet == SOCKET_ERROR)
    {
        PRINTERROR("connect()");
        closesocket(Socket);
        return;
    }

    string request = "GET ";
    request += lpFileName;
    request += "\n";

    // send() may accept fewer bytes than requested, so loop until done.
    size_t sent = 0;
    while (sent < request.size())
    {
        nRet = send(Socket, request.c_str() + sent,
                    static_cast<int>(request.size() - sent), 0);
        if (nRet == SOCKET_ERROR)
        {
            PRINTERROR("send()");
            closesocket(Socket);
            return;
        }
        sent += static_cast<size_t>(nRet);
    }

    char chunk[8192];

    for (;;)
    {
        // Wait to receive, nRet = NumberOfBytesReceived
        nRet = recv(Socket, chunk, sizeof(chunk), 0);
        if (nRet == SOCKET_ERROR)
        {
            PRINTERROR("recv()");
            break;
        }

        // Did the server close the connection?
        if (nRet == 0)
        {
            break;
        }

        // Append exactly the bytes received: the buffer is not
        // NUL-terminated when recv() fills it completely.
        text.append(chunk, static_cast<size_t>(nRet));
    }

    closesocket(Socket);
}



PythonPowered
FindPage by browsing, title search , text search or an index
Or try one of these actions: AttachFile, DeletePage, LikePages, LocalSiteMap, RenamePage, SpellCheck
SeriousMoin v1 (koMoinMoin 1.0a4 Modified)