📚
wikipedia dumpをGCSにアップロード
wikipedia dump
ライブラリ
# Install the packages this notebook pip-installs: the Google Cloud Storage
# client library (upgraded to the latest release), tqdm, and wget.
!pip install --upgrade google-cloud-storage
!pip install tqdm
!pip install wget
Google認証
# Google authentication: needed so this session can access GCS
from google.colab import auth
auth.authenticate_user()
wiki dumpをダウンロード、GCSにアップロード
from tqdm import tqdm
import requests
import wget
from google.cloud import storage
# Initialize the Google Cloud Storage client
client = storage.Client()
bucket_name = 'wikipedia_data_hub' # Put your GCS bucket name here
# Look up the bucket object for bucket_name via the client
bucket = client.get_bucket(bucket_name)
# Japanese Wikipedia dump: download URLs and the matching file names.
# The two lists are parallel (same order, same length).
file_wikipedia_urls = [
    "https://dumps.wikimedia.org/jawiki/20241020/jawiki-20241020-pages-meta-current.xml.bz2",
]
file_wikipedia_names = [
    "jawiki-20241020-pages-meta-current.xml.bz2",
]

# Japanese Wikibooks dump: download URLs and the matching file names.
file_wikibook_urls = [
    "https://dumps.wikimedia.org/jawikibooks/20241020/jawikibooks-20241020-pages-meta-current.xml.bz2",
]
file_wikibook_names = [
    "jawikibooks-20241020-pages-meta-current.xml.bz2",
]
# NOTE(review): authenticate_user() is already called earlier in this file, so
# this repeat call looks redundant — confirm before removing.
auth.authenticate_user()
# Set up the GCS client that upload_to_gcs uses.
# NOTE(review): a storage.Client() is also created earlier as `client`;
# presumably one instance would suffice — verify.
storage_client = storage.Client()
def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Uses the module-level ``storage_client`` and prints a confirmation line
    after the upload call returns.

    Args:
        bucket_name: Name of the destination GCS bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket.
    """
    target_blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")
# Wikipedia case: download every dump listed above and push it to GCS.
for url, file_name in zip(file_wikipedia_urls, file_wikipedia_names):
    # Download the file. The entries of file_wikipedia_urls are already
    # complete URLs, so nothing is prepended (the previous
    # `base_url + file_name` referenced an undefined name and raised
    # NameError). wget.download returns the path it actually wrote, which can
    # differ from file_name when a file of that name already exists locally,
    # so the returned path is what gets uploaded.
    local_path = wget.download(url)
    # Upload to GCS under the xml_wikipedia/ prefix, keeping the canonical name.
    upload_to_gcs(bucket_name, local_path, f"xml_wikipedia/{file_name}")
Discussion