📚

wikipedia dumpをGCSにアップロード

2024/11/02に公開

wikipedia dump

wikibooks
wikipedia

ライブラリ

!pip install --upgrade google-cloud-storage
!pip install tqdm
!pip install wget

Google認証

# Google authentication: required so that this notebook can access GCS.
from google.colab import auth
auth.authenticate_user()

wiki dumpをダウンロード、GCSにアップロード

from tqdm import tqdm
import requests
import wget
from google.cloud import storage

# Destination bucket on Google Cloud Storage; replace with your own bucket name.
bucket_name = 'wikipedia_data_hub'

# Initialise the Google Cloud Storage client and look up the bucket.
client = storage.Client()
bucket = client.get_bucket(bucket_name)

# Japanese Wikipedia dump: download URLs and the matching file names
# (each name is the last path component of its URL).
file_wikipedia_urls = [
    "https://dumps.wikimedia.org/jawiki/20241020/jawiki-20241020-pages-meta-current.xml.bz2",
]
file_wikipedia_names = [
    dump_url.rsplit("/", 1)[-1] for dump_url in file_wikipedia_urls
]

# Japanese Wikibooks dump: download URLs and the matching file names.
file_wikibook_urls = [
    "https://dumps.wikimedia.org/jawikibooks/20241020/jawikibooks-20241020-pages-meta-current.xml.bz2",
]
file_wikibook_names = [
    dump_url.rsplit("/", 1)[-1] for dump_url in file_wikibook_urls
]

# Authenticate the Colab user (repeats the call made in the authentication cell above).
auth.authenticate_user()
# Set up the GCS client that upload_to_gcs uses.
storage_client = storage.Client()

def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to Google Cloud Storage and report it on stdout.

    Uses the module-level ``storage_client``.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name (including any prefix) to create
            in the bucket.
    """
    target_blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Wikipedia dump: download each archive and copy it into the bucket.
for url, file_name in zip(file_wikipedia_urls, file_wikipedia_names):
    # Download from the URL paired with this file name. The previous code
    # rebuilt the URL from `base_url`, which is never defined, so the loop
    # failed with NameError before downloading anything.
    # wget.download returns the path it actually wrote; that path may differ
    # from file_name when a file with the same name already exists locally,
    # so upload the returned path rather than assuming the name.
    local_path = wget.download(url)
    # Upload to GCS under the xml_wikipedia/ prefix.
    upload_to_gcs(bucket_name, local_path, f"xml_wikipedia/{file_name}")

Discussion