Final version
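A scraper that walks the table of contents of the RHEL 8.7 and 8.8 release notes on docs.redhat.com, splits each chapter page into sections (preferring <h4> headings, then <p class="title"> headings, otherwise taking the whole page), and appends the rows to one CSV file per release.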
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import os
import csv
import sys
import argparse
# Check for <p class="title"> tags
def check_p_title_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <p> tag with class "title" exists
        p_title_tags = soup.find_all('p', class_='title')
        if p_title_tags:
            #print(f"Found {len(p_title_tags)} <p class='title'> tags.")
            return True
        else:
            #print("No <p class='title'> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        return False
# Check for <h4> tags
def check_h4_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <h4> tag exists
        h4_tags = soup.find_all('h4')
        if h4_tags:
            #print(f"Found {len(h4_tags)} <h4> tags.")
            return True
        else:
            #print("No <h4> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        return False
# Extract sections split by <p class="title"> headings (CSV rows)
def extract_content(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    if content_container is None:
        return []
    results = []
    current_title = None
    current_detail = []
    for element in content_container.descendants:
        if isinstance(element, NavigableString):
            continue
        #print(f"Element: {element.name}, Class: {element.get('class')}")  # debug output
        if element.name == 'p' and element.get('class') and 'title' in element.get('class'):
            if current_title:
                results.append({
                    'url': url,
                    'page_title': page_title,
                    'section_title': current_title,
                    'details': '\n'.join(current_detail).strip()
                })
            current_title = element.text.strip()
            current_detail = []
            #print(f"New title found: {current_title}")  # debug output
        elif current_title and element.name in ['p', 'pre', 'li']:
            # Extract text only from p, pre and li tags
            text = element.get_text(strip=True)
            if text:
                current_detail.append(text)
    # Append the last section
    if current_title and current_detail:
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': current_title,
            'details': '\n'.join(current_detail).strip()
        })
    return results
# Extract the whole page as a single section
def extract_content_from_page(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    if content_container is None:
        return []
    details = []
    for element in content_container.descendants:
        if isinstance(element, NavigableString):
            continue
        if element.name in ['p', 'pre', 'li']:
            # Extract text from p, pre and li tags
            text = element.get_text(strip=True)
            if text:
                details.append(text)
    # Bundle everything into a single result
    result = {
        'url': url,
        'page_title': page_title,
        'section_title': page_title,
        'details': '\n'.join(details).strip()
    }
    return [result]  # Return as a list so the CSV writing stays uniform
# Extract sections split by <h4> headings (CSV rows)
def extract_content_h4(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    if content_container is None:
        return []
    results = []
    current_title = None
    current_detail = []
    for element in content_container.descendants:
        if isinstance(element, NavigableString):
            continue
        #print(f"Element: {element.name}, Class: {element.get('class')}")  # debug output
        if element.name == 'h4':
            if current_title:
                results.append({
                    'url': url,
                    'page_title': page_title,
                    'section_title': current_title,
                    'details': '\n'.join(current_detail).strip()
                })
            current_title = element.text.strip()
            current_detail = []
            #print(f"New title found: {current_title}")  # debug output
        elif current_title and element.name in ['p', 'pre', 'li']:
            # Extract text only from p, pre and li tags
            text = element.get_text(strip=True)
            if text:
                current_detail.append(text)
    # Append the last section
    if current_title and current_detail:
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': current_title,
            'details': '\n'.join(current_detail).strip()
        })
    return results
def save_to_csv(data, filename):
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'page_title', 'section_title', 'details']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        #writer.writeheader()
        for row in data:
            writer.writerow(row)
# Fetch the table of contents page
def scrape_redhat_docs(url):
    #url = "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index"
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get li elements with the chapter and sub-chapter classes
    chapters = soup.find_all('li', class_=['chapter-title', 'chapter', 'sub-chapter'])
    # List to hold the results
    results = []
    for chapter in chapters:
        # Look for an a tag
        a_tag = chapter.find('a')
        if a_tag:
            title = a_tag.text
            link = a_tag.get('href')
            # Convert relative links to absolute URLs
            if link and not link.startswith('http'):
                link = f"https://docs.redhat.com{link}"
            results.append({'title': title, 'link': link})
    return results
def main():
    urls = [
        "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index",
        "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.7_release_notes/index"
    ]
    filenames = ["RHEL8.8リリースノート", "RHEL8.7リリースノート"]
    # Components of the output path
    base_path = 'C:\\Users\\'  # a raw string cannot end with a backslash, so escape it instead
    extension = '.csv'
    for i in range(len(urls)):
        url = urls[i]
        filename = filenames[i]
        results = scrape_redhat_docs(url)
        path = os.path.join(base_path, f"{filename}{extension}")
        print(path)
        if results:
            for item in results:
                stitle = str(item['title'])
                slink = str(item['link'])
                #print(stitle, slink)
                if check_h4_tag(slink):
                    print("h4")
                    result = extract_content_h4(slink, stitle)
                    save_to_csv(result, path)
                elif check_p_title_tag(slink):
                    print("title")
                    result = extract_content(slink, stitle)
                    save_to_csv(result, path)
                else:
                    result = extract_content_from_page(slink, stitle)
                    save_to_csv(result, path)
                    print("other", stitle, slink)
        else:
            print("No results found or an error occurred.")

if __name__ == "__main__":
    main()
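save_to_csv above opens each file in append mode and leaves writeheader() commented out, so the CSVs come out without a header row. If a header is wanted, a minimal sketch (an illustrative helper with an assumed name, not part of the script above) would write it once per output file before the scraping loop:

import csv
import os

def write_csv_header(path, fieldnames=('url', 'page_title', 'section_title', 'details')):
    # Illustrative helper (assumed name): write the header only if the file does not exist yet.
    if not os.path.exists(path):
        with open(path, 'w', newline='', encoding='utf-8') as csvfile:
            csv.DictWriter(csvfile, fieldnames=list(fieldnames)).writeheader()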
Reference 1
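An earlier single-file version. It hard-codes the 8.8 release notes index and one output path, writes a header row on every append, and includes <strong>- and <h4>-based extractors that are disabled (commented out) in main().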
import requests
from bs4 import BeautifulSoup
import csv
import sys
import argparse
# Check for <p class="title"> tags
def check_p_title_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <p> tag with class "title" exists
        p_title_tags = soup.find_all('p', class_='title')
        if p_title_tags:
            print(f"Found {len(p_title_tags)} <p class='title'> tags.")
            return True
        else:
            print("No <p class='title'> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        return False
# Check for <strong> tags
def check_strong_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <strong> tag exists
        strong_tags = soup.find_all('strong')
        if strong_tags:
            print(f"Found {len(strong_tags)} <strong> tags.")
            return True
        else:
            #print("No <strong> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        #print(f"An error occurred during the request: {e}")
        return False
# Check for <h4> tags
def check_h4_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <h4> tag exists
        h4_tags = soup.find_all('h4')
        if h4_tags:
            #print(f"Found {len(h4_tags)} <h4> tags.")
            return True
        else:
            #print("No <h4> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        #print(f"An error occurred during the request: {e}")
        return False
# Extract sections split by <p class="title"> headings (CSV rows)
def extract_content(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    # Extract the section titles
    titles = [p.text for p in content_container.find_all('p', {'class': 'title'})]
    # Extract the details
    details = []
    current_detail = ""
    for element in content_container.children:
        if element.name == 'p' and element.get('class', None) == ['title']:
            if current_detail:
                details.append(current_detail.strip())
            current_detail = ""
        else:
            current_detail += element.text + "\n"
    if current_detail:
        details.append(current_detail.strip())
    results = []
    for i, title in enumerate(titles):
        if i < len(details):
            detail = details[i]
        else:
            detail = ""  # Use an empty string when there is no matching detail
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': detail
        })
    return results
def extract_details(start_tag, end_tag):
    details = ""
    current_tag = start_tag.next_element
    while current_tag != end_tag:
        if current_tag.name:
            if current_tag.name == 'br':
                details += "\n"
            elif current_tag.name in ['p', 'li', 'ul', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span']:
                details += current_tag.text.strip() + "\n"
        current_tag = current_tag.next_element
    return details.strip()
def scrape_redhat_docs_title(url, page_title):
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return None
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    soup = soup.find('section', class_='rhdocs')
    # Get every <p> tag whose class attribute is "title"
    title_tags = soup.find_all('p', class_='title')
    results = []
    for i in range(len(title_tags)):
        title = title_tags[i].text.strip()
        if i < len(title_tags) - 1:
            details = extract_details(title_tags[i], title_tags[i+1])
        else:
            details = extract_details(title_tags[i], None)
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': details
        })
    return results
# Extract sections split by <strong> tags (CSV rows)
def scrape_redhat_docs_strong(url, page_title):
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return None
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get every <strong> tag
    strong_tags = soup.find_all('strong')
    results = []
    for i in range(len(strong_tags)):
        title = strong_tags[i].text.strip()
        # Collect all text up to the next <strong> tag
        details = ""
        current_element = strong_tags[i].next_sibling
        while current_element and (not isinstance(current_element, type(strong_tags[i])) or current_element.name != 'strong'):
            if current_element.name and current_element.name not in ['script', 'style']:
                details += current_element.text.strip() + " "
            current_element = current_element.next_sibling
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': details.strip()
        })
    return results
# Extract sections split by <h4> tags (CSV rows)
def scrape_redhat_docs_h4(url, page_title):
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return None
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get every <h4> tag
    h4_tags = soup.find_all('h4')
    results = []
    for i in range(len(h4_tags)):
        title = h4_tags[i].text.strip()
        # Collect all text up to the next <h4> tag
        details = ""
        current_element = h4_tags[i].next_sibling
        while current_element and (not isinstance(current_element, type(h4_tags[i])) or current_element.name != 'h4'):
            if current_element.name and current_element.name not in ['script', 'style']:
                details += current_element.text.strip() + " "
            current_element = current_element.next_sibling
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': details.strip()
        })
    return results
def save_to_csv(data, filename):
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'page_title', 'section_title', 'details']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
# Fetch the table of contents page
def scrape_redhat_docs():
    url = "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index"
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get li elements with the chapter and sub-chapter classes
    chapters = soup.find_all('li', class_=['chapter-title', 'chapter', 'sub-chapter'])
    # List to hold the results
    results = []
    for chapter in chapters:
        # Look for an a tag
        a_tag = chapter.find('a')
        if a_tag:
            #title = a_tag.text.strip()
            title = a_tag.text
            link = a_tag.get('href')
            # Convert relative links to absolute URLs
            if link and not link.startswith('http'):
                link = f"https://docs.redhat.com{link}"
            results.append({'title': title, 'link': link})
    return results
def main():
    results = scrape_redhat_docs()
    path = r'C:\Users\shota\rhel.csv'
    if results:
        for item in results:
            stitle = str(item['title'])
            slink = str(item['link'])
            print(stitle, slink)
            if check_h4_tag(slink):
                print("h4")
                #result = scrape_redhat_docs_h4(slink, stitle)
                #save_to_csv(result, path)
            elif check_p_title_tag(slink):
                print("title")
                result = extract_content(slink, stitle)
                #result = scrape_redhat_docs_title(slink, stitle)
                save_to_csv(result, path)
            else:
                print("other")
            '''
            # Extract with <strong> tags
            elif check_strong_tag(slink) == True:
                print("strong")
                result = scrape_redhat_docs_strong(slink, stitle)
                save_to_csv(result, path)
            '''
    else:
        print("No results found or an error occurred.")

if __name__ == "__main__":
    main()
Reference 2
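The first cut of extract_content: it walks only the <p> tags inside docs-content-container and returns the titles, the detail blocks, and the page title as three separate values instead of CSV-ready row dictionaries.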
import requests
from bs4 import BeautifulSoup

def extract_content(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    # Extract the section titles
    titles = [p.text for p in content_container.find_all('p', {'class': 'title'})]
    # Extract the details
    details = []
    current_detail = ""
    for p in content_container.find_all('p'):
        if p.get('class', None) == ['title']:
            if current_detail:
                details.append(current_detail.strip())
            current_detail = ""
        else:
            current_detail += p.text + "\n"
    if current_detail:
        details.append(current_detail.strip())
    return titles, details, page_title
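For reference, a minimal sketch (not from the original notes) of how this return value could be turned into the same row dictionaries the later versions produce; pairing titles and details purely by position with zip is an assumption of this sketch:

# Hypothetical example values (any chapter page of the release notes would do).
url = "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index"
titles, details, page_title = extract_content(url, "8.8 Release Notes")
# Pair titles and details by index to build CSV-ready rows.
rows = [
    {'url': url, 'page_title': page_title, 'section_title': t, 'details': d}
    for t, d in zip(titles, details)
]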