Final version
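A scraper that walks the table of contents of the RHEL 8.7 and 8.8 release notes on docs.redhat.com, splits each chapter page into sections (preferring <h4> headings, then <p class="title"> headings, otherwise taking the whole page), and appends the rows to one CSV file per release.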
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import os
import csv
import sys
import argparse
# Check for <p class="title"> tags
def check_p_title_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <p> tag with class "title" exists
        p_title_tags = soup.find_all('p', class_='title')
        if p_title_tags:
            #print(f"Found {len(p_title_tags)} <p class='title'> tags.")
            return True
        else:
            #print("No <p class='title'> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        return False
# Check for <h4> tags
def check_h4_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <h4> tag exists
        h4_tags = soup.find_all('h4')
        if h4_tags:
            #print(f"Found {len(h4_tags)} <h4> tags.")
            return True
        else:
            #print("No <h4> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        return False
# Extract sections split by <p class="title"> headings (CSV rows)
def extract_content(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    if content_container is None:
        return []
    results = []
    current_title = None
    current_detail = []
    for element in content_container.descendants:
        if isinstance(element, NavigableString):
            continue
        #print(f"Element: {element.name}, Class: {element.get('class')}")  # debug output
        if element.name == 'p' and element.get('class') and 'title' in element.get('class'):
            if current_title:
                results.append({
                    'url': url,
                    'page_title': page_title,
                    'section_title': current_title,
                    'details': '\n'.join(current_detail).strip()
                })
            current_title = element.text.strip()
            current_detail = []
            #print(f"New title found: {current_title}")  # debug output
        elif current_title and element.name in ['p', 'pre', 'li']:
            # Extract text only from p, pre and li tags
            text = element.get_text(strip=True)
            if text:
                current_detail.append(text)
    # Append the last section
    if current_title and current_detail:
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': current_title,
            'details': '\n'.join(current_detail).strip()
        })
    return results
# Extract the whole page as a single section
def extract_content_from_page(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    if content_container is None:
        return []
    details = []
    for element in content_container.descendants:
        if isinstance(element, NavigableString):
            continue
        if element.name in ['p', 'pre', 'li']:
            # Extract text from p, pre and li tags
            text = element.get_text(strip=True)
            if text:
                details.append(text)
    # Bundle everything into a single result
    result = {
        'url': url,
        'page_title': page_title,
        'section_title': page_title,
        'details': '\n'.join(details).strip()
    }
    return [result]  # Return as a list so the CSV writing stays uniform
# Extract sections split by <h4> headings (CSV rows)
def extract_content_h4(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    if content_container is None:
        return []
    results = []
    current_title = None
    current_detail = []
    for element in content_container.descendants:
        if isinstance(element, NavigableString):
            continue
        #print(f"Element: {element.name}, Class: {element.get('class')}")  # debug output
        if element.name == 'h4':
            if current_title:
                results.append({
                    'url': url,
                    'page_title': page_title,
                    'section_title': current_title,
                    'details': '\n'.join(current_detail).strip()
                })
            current_title = element.text.strip()
            current_detail = []
            #print(f"New title found: {current_title}")  # debug output
        elif current_title and element.name in ['p', 'pre', 'li']:
            # Extract text only from p, pre and li tags
            text = element.get_text(strip=True)
            if text:
                current_detail.append(text)
    # Append the last section
    if current_title and current_detail:
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': current_title,
            'details': '\n'.join(current_detail).strip()
        })
    return results
def save_to_csv(data, filename):
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'page_title', 'section_title', 'details']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        #writer.writeheader()
        for row in data:
            writer.writerow(row)
# Fetch the table of contents page
def scrape_redhat_docs(url):
    #url = "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index"
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get li elements with the chapter and sub-chapter classes
    chapters = soup.find_all('li', class_=['chapter-title', 'chapter', 'sub-chapter'])
    # List to hold the results
    results = []
    for chapter in chapters:
        # Look for an a tag
        a_tag = chapter.find('a')
        if a_tag:
            title = a_tag.text
            link = a_tag.get('href')
            # Convert relative links to absolute URLs
            if link and not link.startswith('http'):
                link = f"https://docs.redhat.com{link}"
            results.append({'title': title, 'link': link})
    return results
def main():
    urls = [
        "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index",
        "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.7_release_notes/index"
    ]
    filenames = ["RHEL8.8リリースノート", "RHEL8.7リリースノート"]
    # Components of the output path
    base_path = 'C:\\Users\\'  # a raw string cannot end with a backslash, so escape it instead
    extension = '.csv'
    for i in range(len(urls)):
        url = urls[i]
        filename = filenames[i]
        results = scrape_redhat_docs(url)
        path = os.path.join(base_path, f"{filename}{extension}")
        print(path)
        if results:
            for item in results:
                stitle = str(item['title'])
                slink = str(item['link'])
                #print(stitle, slink)
                if check_h4_tag(slink):
                    print("h4")
                    result = extract_content_h4(slink, stitle)
                    save_to_csv(result, path)
                elif check_p_title_tag(slink):
                    print("title")
                    result = extract_content(slink, stitle)
                    save_to_csv(result, path)
                else:
                    result = extract_content_from_page(slink, stitle)
                    save_to_csv(result, path)
                    print("other", stitle, slink)
        else:
            print("No results found or an error occurred.")

if __name__ == "__main__":
    main()
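save_to_csv above opens each file in append mode and leaves writeheader() commented out, so the CSVs come out without a header row. If a header is wanted, a minimal sketch (an illustrative helper with an assumed name, not part of the script above) would write it once per output file before the scraping loop:

import csv
import os

def write_csv_header(path, fieldnames=('url', 'page_title', 'section_title', 'details')):
    # Illustrative helper (assumed name): write the header only if the file does not exist yet.
    if not os.path.exists(path):
        with open(path, 'w', newline='', encoding='utf-8') as csvfile:
            csv.DictWriter(csvfile, fieldnames=list(fieldnames)).writeheader()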
Reference 1
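An earlier single-file version. It hard-codes the 8.8 release notes index and one output path, writes a header row on every append, and includes <strong>- and <h4>-based extractors that are disabled (commented out) in main().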
import requests
from bs4 import BeautifulSoup
import csv
import sys
import argparse
# Check for <p class="title"> tags
def check_p_title_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <p> tag with class "title" exists
        p_title_tags = soup.find_all('p', class_='title')
        if p_title_tags:
            print(f"Found {len(p_title_tags)} <p class='title'> tags.")
            return True
        else:
            print("No <p class='title'> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        return False
# Check for <strong> tags
def check_strong_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <strong> tag exists
        strong_tags = soup.find_all('strong')
        if strong_tags:
            print(f"Found {len(strong_tags)} <strong> tags.")
            return True
        else:
            #print("No <strong> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        #print(f"An error occurred during the request: {e}")
        return False
# Check for <h4> tags
def check_h4_tag(url):
    try:
        # Fetch the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        # Parse the page into a BeautifulSoup object
        soup = BeautifulSoup(response.content, 'html.parser')
        # Check whether any <h4> tag exists
        h4_tags = soup.find_all('h4')
        if h4_tags:
            #print(f"Found {len(h4_tags)} <h4> tags.")
            return True
        else:
            #print("No <h4> tags found.")
            return False
    except requests.exceptions.RequestException as e:
        #print(f"An error occurred during the request: {e}")
        return False
# Extract sections split by <p class="title"> headings (CSV rows)
def extract_content(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    # Extract the section titles
    titles = [p.text for p in content_container.find_all('p', {'class': 'title'})]
    # Extract the details
    details = []
    current_detail = ""
    for element in content_container.children:
        if element.name == 'p' and element.get('class', None) == ['title']:
            if current_detail:
                details.append(current_detail.strip())
            current_detail = ""
        else:
            current_detail += element.text + "\n"
    if current_detail:
        details.append(current_detail.strip())
    results = []
    for i, title in enumerate(titles):
        if i < len(details):
            detail = details[i]
        else:
            detail = ""  # Use an empty string when there is no matching detail
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': detail
        })
    return results
def extract_details(start_tag, end_tag):
    details = ""
    current_tag = start_tag.next_element
    while current_tag != end_tag:
        if current_tag.name:
            if current_tag.name == 'br':
                details += "\n"
            elif current_tag.name in ['p', 'li', 'ul', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span']:
                details += current_tag.text.strip() + "\n"
        current_tag = current_tag.next_element
    return details.strip()
def scrape_redhat_docs_title(url, page_title):
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return None
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    soup = soup.find('section', class_='rhdocs')
    # Get every <p> tag whose class attribute is "title"
    title_tags = soup.find_all('p', class_='title')
    results = []
    for i in range(len(title_tags)):
        title = title_tags[i].text.strip()
        if i < len(title_tags) - 1:
            details = extract_details(title_tags[i], title_tags[i+1])
        else:
            details = extract_details(title_tags[i], None)
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': details
        })
    return results
# Extract sections split by <strong> tags (CSV rows)
def scrape_redhat_docs_strong(url, page_title):
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return None
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get every <strong> tag
    strong_tags = soup.find_all('strong')
    results = []
    for i in range(len(strong_tags)):
        title = strong_tags[i].text.strip()
        # Collect all text up to the next <strong> tag
        details = ""
        current_element = strong_tags[i].next_sibling
        while current_element and (not isinstance(current_element, type(strong_tags[i])) or current_element.name != 'strong'):
            if current_element.name and current_element.name not in ['script', 'style']:
                details += current_element.text.strip() + " "
            current_element = current_element.next_sibling
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': details.strip()
        })
    return results
# Extract sections split by <h4> tags (CSV rows)
def scrape_redhat_docs_h4(url, page_title):
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return None
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get every <h4> tag
    h4_tags = soup.find_all('h4')
    results = []
    for i in range(len(h4_tags)):
        title = h4_tags[i].text.strip()
        # Collect all text up to the next <h4> tag
        details = ""
        current_element = h4_tags[i].next_sibling
        while current_element and (not isinstance(current_element, type(h4_tags[i])) or current_element.name != 'h4'):
            if current_element.name and current_element.name not in ['script', 'style']:
                details += current_element.text.strip() + " "
            current_element = current_element.next_sibling
        results.append({
            'url': url,
            'page_title': page_title,
            'section_title': title,
            'details': details.strip()
        })
    return results
def save_to_csv(data, filename):
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'page_title', 'section_title', 'details']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
# Fetch the table of contents page
def scrape_redhat_docs():
    url = "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index"
    # Fetch the web page
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching the page. Status code: {response.status_code}")
        return
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get li elements with the chapter and sub-chapter classes
    chapters = soup.find_all('li', class_=['chapter-title', 'chapter', 'sub-chapter'])
    # List to hold the results
    results = []
    for chapter in chapters:
        # Look for an a tag
        a_tag = chapter.find('a')
        if a_tag:
            #title = a_tag.text.strip()
            title = a_tag.text
            link = a_tag.get('href')
            # Convert relative links to absolute URLs
            if link and not link.startswith('http'):
                link = f"https://docs.redhat.com{link}"
            results.append({'title': title, 'link': link})
    return results
def main():
    results = scrape_redhat_docs()
    path = r'C:\Users\shota\rhel.csv'
    if results:
        for item in results:
            stitle = str(item['title'])
            slink = str(item['link'])
            print(stitle, slink)
            if check_h4_tag(slink):
                print("h4")
                #result = scrape_redhat_docs_h4(slink, stitle)
                #save_to_csv(result, path)
            elif check_p_title_tag(slink):
                print("title")
                result = extract_content(slink, stitle)
                #result = scrape_redhat_docs_title(slink, stitle)
                save_to_csv(result, path)
            else:
                print("other")
            '''
            # Extract with <strong> tags
            elif check_strong_tag(slink) == True:
                print("strong")
                result = scrape_redhat_docs_strong(slink, stitle)
                save_to_csv(result, path)
            '''
    else:
        print("No results found or an error occurred.")

if __name__ == "__main__":
    main()
Reference 2
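The first cut of extract_content: it walks only the <p> tags inside docs-content-container and returns the titles, the detail blocks, and the page title as three separate values instead of CSV-ready row dictionaries.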
import requests
from bs4 import BeautifulSoup

def extract_content(url, page_title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the docs-content-container div
    content_container = soup.find('div', {'class': 'docs-content-container'})
    # Extract the section titles
    titles = [p.text for p in content_container.find_all('p', {'class': 'title'})]
    # Extract the details
    details = []
    current_detail = ""
    for p in content_container.find_all('p'):
        if p.get('class', None) == ['title']:
            if current_detail:
                details.append(current_detail.strip())
            current_detail = ""
        else:
            current_detail += p.text + "\n"
    if current_detail:
        details.append(current_detail.strip())
    return titles, details, page_title
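For reference, a minimal sketch (not from the original notes) of how this return value could be turned into the same row dictionaries the later versions produce; pairing titles and details purely by position with zip is an assumption of this sketch:

# Hypothetical example values (any chapter page of the release notes would do).
url = "https://docs.redhat.com/ja/documentation/red_hat_enterprise_linux/8/html/8.8_release_notes/index"
titles, details, page_title = extract_content(url, "8.8 Release Notes")
# Pair titles and details by index to build CSV-ready rows.
rows = [
    {'url': url, 'page_title': page_title, 'section_title': t, 'details': d}
    for t, d in zip(titles, details)
]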