Dependency

!pip install simplejson

Requirement already satisfied: simplejson in c:\users\zarathu09\anaconda3\envs\zarathu\lib\site-packages (3.18.3)

from datetime import datetime
import os
import sys
import urllib.request
import pandas as pd 
import json
import re 
import requests
import simplejson

API Keys

API KEY를 발급받는 방법은 아래 링크를 참조해주세요.

네이버 https://zerosecu.tistory.com/18

일 허용 한도 25000건

카카오 https://kadosholy.tistory.com/25
구글 https://gomgomi.tistory.com/3

일일 검색어 제한 10,000개

# Naver_client_id = 
# Naver_client_secret = 
# Kakao_API_key= 
# Google_SEARCH_ENGINE_ID = 
# Google_API_KEY =

지식인, 블로그, 동영상, PDF 파일, 도서, 신문기사는 검색 결과에서 제외합니다. 링크에 포함되어서는 안 될 도메인을 아래 목록으로 걸러냅니다!

추가로 제외할 도메인이 있으면 아래 리스트에 넣어주세요.

# Substrings that disqualify a result: any link containing one of these
# is skipped by the search functions below.
Trash_Link = [
    "tistory",
    "kin",
    "youtube",
    "blog",
    "book",
    "news",
    "dcinside",
    "fmkorea",
    "ruliweb",
    "theqoo",
    "clien",
    "mlbpark",
    "instiz",
    "todayhumor",
]

GOOGLE API

def Google_API(query, wanted_row):
    """Collect Google Custom Search results for ``query`` into a DataFrame.

    Args:
        query: Search expression. Every "|" is replaced with "OR" and a
            ``-filetype:pdf`` exclusion term is appended before sending.
        wanted_row: Number of rows to collect. Collection also stops once
            300 rows have been gathered.

    Returns:
        A DataFrame with the columns ``Title``, ``Link`` and ``Description``.
        Rows are indexed by ``start + position``, where ``start`` is the
        value sent as the ``start`` parameter and ``position`` is the item's
        1-based position on that page. Links containing any ``Trash_Link``
        entry are skipped. The frame holds whatever was collected when the
        API stopped returning items, so it may have fewer rows than
        requested (or none at all).
    """
    query = query.replace("|", "OR")
    # The exclusion term must be its own token; without the leading space it
    # would be glued onto the last search word.
    query += " -filetype:pdf"

    df_google = pd.DataFrame(columns=['Title', 'Link', 'Description'])
    row_count = 0

    # The upper bound is deliberately generous: filtered links do not count
    # towards wanted_row, and the loop ends early once a response carries no
    # "items".
    for start_page in range(1, wanted_row + 1000, 10):
        # Passing the values through `params` lets requests URL-encode them,
        # so characters such as "&" or "#" in the query cannot break the URL.
        params = {
            "key": Google_API_KEY,
            "cx": Google_SEARCH_ENGINE_ID,
            "q": query,
            "start": start_page,
        }
        data = requests.get(
            "https://www.googleapis.com/customsearch/v1", params=params
        ).json()

        search_items = data.get("items")
        if not search_items:
            # No "items" key (or an empty list): nothing more to collect.
            return df_google

        for i, search_item in enumerate(search_items, start=1):
            link = search_item.get("link")
            if not link or any(trash in link for trash in Trash_Link):
                continue

            title = search_item.get("title")
            description = search_item.get("snippet")
            df_google.loc[start_page + i] = [title, link, description]
            row_count += 1
            if (row_count >= wanted_row) or (row_count == 300):
                return df_google

    return df_google

NAVER API

def Naver_API(query,wanted_row):
    """Collect Naver web-document search results for ``query`` into a DataFrame.

    Args:
        query: Search expression; it is percent-encoded before being put
            into the request URL.
        wanted_row: Number of rows to collect. Collection also stops once
            300 rows have been gathered.

    Returns:
        A DataFrame with the columns ``Title``, ``Link`` and ``Description``
        (HTML tags removed from title and description). The index is the
        running position of the item across all fetched pages, counting the
        filtered-out items as well. A DataFrame is always returned: on a
        request/parse failure or when the results run out, it holds whatever
        was collected so far.
    """
    # Local import: the file-level imports do not provide this name.
    from http.client import HTTPException

    query = urllib.parse.quote(query)

    display = 100
    sort = 'sim'
    # Compiled once instead of once per result page.
    remove_tag = re.compile('<.*?>')

    df = pd.DataFrame(columns=['Title', 'Link', 'Description'])
    idx = 0
    row_count = 0

    # The upper bound is deliberately generous because filtered links do not
    # count towards wanted_row; the loop normally ends through a return.
    for start_index in range(1, wanted_row + 10000, display):
        url = (
            "https://openapi.naver.com/v1/search/webkr?query=" + query
            + "&display=" + str(display)
            + "&start=" + str(start_index)
            + "&sort=" + sort
        )
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", Naver_client_id)
        request.add_header("X-Naver-Client-Secret", Naver_client_secret)

        try:
            # The context manager closes the HTTP response in every case.
            with urllib.request.urlopen(request) as response:
                if response.getcode() != 200:
                    continue
                payload = json.loads(response.read().decode('utf-8'))
        except (OSError, HTTPException, ValueError):
            # Best effort: a failed request (urllib's URLError/HTTPError are
            # OSError subclasses) or an undecodable body ends the collection
            # with the rows gathered so far.
            return df

        items = payload.get('items') if isinstance(payload, dict) else None
        if not items:
            # An empty page means there is nothing further to fetch.
            return df

        for item in items:
            link = item.get('link', '')
            if not link or any(trash in link for trash in Trash_Link):
                idx += 1
                continue

            title = remove_tag.sub('', item.get('title', ''))
            description = remove_tag.sub('', item.get('description', ''))
            df.loc[idx] = [title, link, description]
            idx += 1
            row_count += 1
            if (row_count >= wanted_row) or (row_count == 300):
                return df

    # Reached when the page range is exhausted before enough rows were found;
    # without this the function would implicitly return None.
    return df

DAUM KAKAO API

def Daum_API(query,wanted_row):
    """Collect Daum (Kakao) web search results for ``query`` into a DataFrame.

    Args:
        query: Search expression, sent as the ``query`` request parameter.
        wanted_row: Number of rows to collect. Collection also stops once
            300 rows have been gathered.

    Returns:
        A DataFrame with the columns ``Title``, ``Link`` and ``Description``
        (HTML tags removed from title and description), indexed by
        ``10 * page + position``. Links containing any ``Trash_Link`` entry
        and documents whose ``datetime`` year is before 2020 are skipped;
        documents whose year cannot be read are kept.
    """
    pages = wanted_row // 10

    url = "https://dapi.kakao.com/v2/search/web"
    header = {'authorization': f'KakaoAK {Kakao_API_key}'}
    remove_tag = re.compile('<.*?>')

    df = pd.DataFrame(columns=['Title', 'Link', 'Description'])
    row_count = 0

    # Ten extra pages are requested to make up for rows lost to filtering.
    for page in range(1, pages + 10):
        params = {'query': query, 'page': page}
        response = requests.get(url, params=params, headers=header)
        try:
            payload = response.json()
        except ValueError:
            # Body is not JSON: stop and return what has been collected.
            break

        documents = payload.get("documents") if isinstance(payload, dict) else None
        if not documents:
            # Error payloads (e.g. rejected key or page number) carry no
            # "documents"; indexing the key directly would raise KeyError.
            break

        for i, item in enumerate(documents, start=1):
            link = item['url']
            try:
                written_year = int(item['datetime'][:4])
            except (KeyError, TypeError, ValueError):
                # Unknown year: do not filter the document out.
                written_year = None

            too_old = written_year is not None and written_year < 2020
            if too_old or any(trash in link for trash in Trash_Link):
                continue

            # Tags are stripped on insertion, so no clean-up pass is needed
            # before returning.
            title = remove_tag.sub('', item["title"])
            description = remove_tag.sub('', item["contents"])
            df.loc[10 * page + i] = [title, link, description]
            row_count += 1
            if (row_count >= wanted_row) or (row_count == 300):
                return df

        # `meta.is_end` is the API's last-page flag (see the Kakao search
        # API reference); stop instead of requesting pages past the end.
        meta = payload.get("meta")
        if isinstance(meta, dict) and meta.get("is_end"):
            break

    return df

최종코드

# Date stamp in YYYYMMDD form; used for the search_date column and the
# output file name.
today = f"{datetime.now():%Y%m%d}"
today

'20230210'

def final(query,wanted_row=100):
    """Run the query on Google, Naver and Daum and stack the results.

    Args:
        query: Search expression handed unchanged to each engine function.
        wanted_row: Number of rows requested from each engine.

    Returns:
        One DataFrame holding the rows of all three engines (in the order
        Google, Naver, Daum) with a fresh 0..n-1 index, a ``search_engine``
        column naming the source and a ``search_date`` column set to the
        module-level ``today`` value.
    """
    engines = (
        ('Google', Google_API),
        ('Naver', Naver_API),
        ('Daum', Daum_API),
    )

    frames = []
    for label, fetch in engines:
        frame = fetch(query, wanted_row)
        frame['search_engine'] = label
        frames.append(frame)

    combined = pd.concat(frames)
    combined['search_date'] = today
    return combined.reset_index(drop=True)

# Put the search expression in `query` and the number of documents to
# extract per search engine in `wanted_row`.
query = "뇌진탕 | 외상성 뇌손상 | 두부외상"
wanted_row = 100

df = final(query, wanted_row)

df

	Title	Link	Description	search_engine	search_date
0	외상에 의한 뇌 손상 \| 질환백과 \| 의료정보 \| 건강정보 \| 서울아산병원	https://www.amc.seoul.kr/asan/mobile/healthinf...	최근에는 광범위한 뇌진탕의 정의에 의식 소실이 없는 경우도 포함합니다. ... 교통...	Google	20230210
1	외상성 뇌 손상 \| Disability Rights California	https://www.disabilityrightsca.org/ko/publicat...	팻비 프로그램 이 프로그램은 정보, 의뢰, 봉사 활동, 훈련, 자기 옹호 지원, 개...	Google	20230210
2	외상성 뇌 손상: Let's Help You - Washington 211	https://wa211.org/ko/%EC%99%B8%EC%83%81%EC%84%...	Washington 211은 워싱턴주 사회보건서비스부와 협력하여 외상성 뇌손상(TB...	Google	20230210
3	Traumatic Brain Injury / Concussion \| Concussi...	https://www.cdc.gov/traumaticbraininjury/index...	A traumatic brain injury, or TBI, is an injury...	Google	20230210
4	외상성 뇌손상: 위원회 결정의 근거	https://www.fcps.edu/sites/default/files/media...	학생 이름. 학생 번호. 회의 날짜. 정의: 외상성 뇌손상이란 외부의 물리적 힘에 ...	Google	20230210
...	...	...	...	...	...
239	대한진단검사의학회	https://www.kslm.org/sub07/studying_total.html...	각각의 특징을 열거한다. 2621 (4) 기타 손상 뇌좌상을 정의하고, 각각의 특징...	Daum	20230210
240	두통, 오심을 주소로 하는 뇌진탕후증후군 (postconcussion syndrom...	https://www.jikm.or.kr/journal/scholar_xmls.ph...	부종을 일으킬 수 있다13. 뇌진탕(concussion) 혹은 경미한 두부 외상(M...	Daum	20230210
241	신경외과	http://gbh.or.kr/hosp/hosp03_03.html	뇌졸중, 뇌출혈, 뇌경색 등 뇌혈관 질환 뇌, 척추의 외상성 손상 간질, 뇌종양 척...	Daum	20230210
242	안내	https://kangnam.hallym.or.kr/hallymuniv_sub.as...	이름, 전문진료분야 이름 전문진료분야 이호국 교수상세보기 진료예약 진료예약 뇌졸중(...	Daum	20230210
243	의료진/진료시간표	https://yumc.ac.kr:8443/medical/timetable.do;j...	및 외상성 척추질환, 척수신경종양, 척추감염, 척추관련통증 교수 오전 인터넷예약 간...	Daum	20230210

244 rows × 5 columns

# Show how many of the collected rows came from each search engine.
df['search_engine'].value_counts()

Naver     99
Daum      79
Google    66
Name: search_engine, dtype: int64

# Remove special characters from the query before using it in the file name:
# only Hangul syllables, digits, ASCII letters and whitespace are kept.
query_filename = re.compile(r"[^\uAC00-\uD7A30-9a-zA-Z\s]").sub("", query)
df.to_csv(f'{query_filename}_{today}.csv', index=False)

'뇌진탕  외상성 뇌손상  두부외상'

Citation

BibTeX citation:

@online{untitled,
  author = {},
  title = {Depedency},
  url = {https://blog.zarathu.com/source_code/Custom_Search_Zarathu.html},
  langid = {en}
}

For attribution, please cite this work as:

“Depedency.” n.d. https://blog.zarathu.com/source_code/Custom_Search_Zarathu.html.