🧪

【ケモインフォマティクス事例】PubChemに登録されている化合物リストを取得する

yunosuke

2023/11/30に公開

pubchem

ケモインフォマティクス

tech

PubChemには膨大な量の化合物の情報が保存されています。
本記事では、その膨大な化合物のリストを取得する方法について説明します。

pubchemの化合物の情報が入ったSDFファイルのDLリンクを取得する

今回取得する化合物のリストはSDFファイルに保存されています。

SDFファイル（Structure-Data File）は、化学情報学や薬物デザインなどの分野で広く使用されるファイル形式の一つで、化学物質の構造情報と関連データを格納するために設計されたテキストベースのファイルフォーマットです。以下にSDFファイルの主な特徴と用途を説明します：

構造情報: 化学物質の分子構造情報を格納するために使用され、分子の原子、結合、立体構造、および部分構造（官能基、環状構造など）に関する詳細な情報が含まれています。この情報は、分子の三次元構造を表示および解析するのに使用されます。
関連データ: 化合物に関連するさまざまなデータを格納するためのフィールドを持っており、化合物の名前、化学式、分子量、融点、沸点、溶解度、毒性情報、生物活性データなどが含まれます。関連データは、データベース検索や薬物設計に役立ちます。
多様な用途: 新しい薬物候補のデザイン、化合物ライブラリーの管理、薬物スクリーニング、バーチャルスクリーニング、化学情報の交換、結晶構造データベースの構築など、化学情報管理と化学データの共有のさまざまな用途に使用されます。

SDFファイルは、化学情報の共有と交換のための標準的なフォーマットの一つであり、多くの化学データベースやソフトウェアツールでサポートされています。化学データの取り扱いや分析に関連する多くの作業でSDFファイルが使用され、化学者や薬物研究者にとって重要なツールとなっています。

このSDFファイルのダウンロードリンク一覧を取得するための関数が以下になります。

from bs4 import BeautifulSoup
import requests
import os, re

def pubchemUrlList() -> list:
    """
    Collect the URLs of every ``.sdf.gz`` file linked from the PubChem
    ``CURRENT-Full/SDF`` directory listing.

    Returns
    -------
    list
        Download URLs (listing URL + link target), in the order the links
        appear on the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the listing cannot be fetched (connection problem, timeout, or
        an HTTP error status).
    """
    url = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/"
    response = requests.get(url, timeout=60)
    # Fail loudly instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # href=True skips anchors that have no href attribute; for those,
    # .get("href") returns None and calling .endswith() on it would raise.
    return [
        url + anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].endswith(".sdf.gz")
    ]

def pubchemModifiedDict() -> dict:
    """
    Map each entry of the PubChem ``CURRENT-Full/SDF`` directory listing to
    its last-modified timestamp.

    Returns
    -------
    dict
        Keys are the first space-delimited token of each non-blank listing
        row (the file name for file rows). Values are the first
        ``YYYY-MM-DD HH:MM`` string found in that row, or ``None`` when the
        row contains no such timestamp.

    Raises
    ------
    requests.exceptions.RequestException
        If the listing cannot be fetched (connection problem, timeout, or
        an HTTP error status).
    ValueError
        If the page does not contain the ``<pre>`` block that holds the
        listing.
    """
    url = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/"
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    listing = soup.find('pre')
    if listing is None:
        raise ValueError("directory listing (<pre> block) not found in " + url)

    # Matches timestamps of the form YYYY-MM-DD HH:MM.
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')

    modified = {}
    for row in listing.text.split('\n'):
        # Blank lines would only produce a meaningless '' key.
        if not row.strip():
            continue
        match = date_pattern.search(row)
        modified[row.split(' ')[0]] = match.group() if match else None
    return modified

def pubchemDownloadList() -> list:
    """
    Build the download table for the PubChem SDF files.

    Returns
    -------
    list
        One ``[file name, url, last_modified, db name]`` list for each URL
        returned by ``pubchemUrlList``. The db name is always "pubchem".
    """
    source_name = "pubchem"
    sdf_urls = pubchemUrlList()
    last_modified = pubchemModifiedDict()

    table = []
    for sdf_url in sdf_urls:
        file_name = os.path.basename(sdf_url)
        table.append([file_name, sdf_url, last_modified[file_name], source_name])
    return table

pubchemDownloadList() の実行結果

[['Compound_000000001_000500000.sdf.gz',
  'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/Compound_000000001_000500000.sdf.gz',
  '2023-07-26 00:57',
  'pubchem'],
 ['Compound_000500001_001000000.sdf.gz',
  'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/Compound_000500001_001000000.sdf.gz',
  '2023-10-04 01:38',
  'pubchem'],
 ['Compound_001000001_001500000.sdf.gz',
...
  'pubchem'],
 ['Compound_168500001_169000000.sdf.gz',
  'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/Compound_168500001_169000000.sdf.gz',
  '2023-10-09 04:14',
  'pubchem']]

[ファイル名, url, 更新日, ソース] が二次元リストで返される。
2023/10/10時点で取得できるファイルの数は338件。

URLからファイルをDLする

以下の関数で実行する。

import requests, traceback, os, shutil, pathlib, gzip, math
from tqdm import tqdm

def fetch(url: str, filename: str = '', directory: str = '') -> "str | bool":
    """
    Download the file at ``url`` and save it locally.

    Parameters
    ----------
    url : str
        URL of the file to download.

    filename : str, default ''
        Name to save the file under. When empty, the last path component
        of ``url`` is used.

    directory : str, default ''
        Destination directory. When empty, ``filename`` is used as-is, so
        the file is written relative to the current working directory.

    Returns
    -------
    str or bool
        Path of the saved file (or of the already existing file) on
        success, ``False`` when anything goes wrong. The traceback of the
        failure is printed.
    """
    try:
        if filename == '':
            filename = os.path.basename(url)

        save_name = filename
        if str(directory) != '':
            save_name = os.path.join(str(directory), filename)

        # Check the local cache first so no network round trip is needed
        # for a file that has already been downloaded.
        if os.path.exists(save_name):
            print('already exists')
            return save_name

        # Download into a temporary name and rename at the end, so that an
        # interrupted download is never mistaken for a finished file by the
        # "already exists" check above.
        part_name = save_name + '.part'

        with requests.get(url, stream=True) as r:
            # Do not save an HTTP error page as if it were the file.
            r.raise_for_status()

            length = r.headers.get('content-length')
            total = int(length) if length is not None else None
            if total is None:
                print('size', 'unknown')
            else:
                # NOTE(review): convertSize is not defined in the visible
                # code; presumably it formats a byte count for display.
                # Confirm it is defined before fetch is called.
                print('size', convertSize(total))

            try:
                with open(part_name, 'wb') as f, tqdm(
                    total=total, desc='download', unit='B',
                    unit_scale=True, leave=False, ncols=100,
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
            except BaseException:
                # Remove the incomplete download before reporting failure.
                if os.path.exists(part_name):
                    os.remove(part_name)
                raise

        os.replace(part_name, save_name)
        return save_name
    except Exception:
        print(traceback.format_exc())
        return False

ファイルをダウンロードする。

# Look up every downloadable file. Each entry has the form
# [file name, url, last_modified, db name], for example:
#
# urls[0]
# >>
# ['Compound_000000001_000500000.sdf.gz',
#  'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/Compound_000000001_000500000.sdf.gz',
#  '2023-07-26 00:57',
#  'pubchem']
urls = pubchemDownloadList()

# Download the first file of the list under its own file name.
url = urls[0]
file_path = fetch(url=url[1], filename=url[0])

カレントディレクトリに Compound_000000001_000500000.sdf.gzがダウンロードされ、file_pathにダウンロードされたファイルのパスが入る。

DLしたファイルからCIDとSMILESを取得する

CIDは化合物毎に登録されているpubchemの固有ID。

取得するための関数。

import gzip, traceback

def splitMolBlock_exist(file):
    """
    Generator yielding ``(cid, smiles)`` for each record of an ``.sdf`` or
    ``.sdf.gz`` file.

    The value of a field is read from the line that follows a line
    containing ``PUBCHEM_COMPOUND_CID`` / ``PUBCHEM_OPENEYE_CAN_SMILES``.
    A record ends at a line starting with ``$$$$``.

    Parameters
    ----------
    file : str
        Path to the file. Must end with ``.sdf`` or ``.sdf.gz``.

    Yields
    ------
    tuple
        ``(cid, smiles)`` with ``cid`` as ``int`` and ``smiles`` as ``str``.
        A field that is missing from a record is ``None``. Records with
        neither field are skipped.

    Raises
    ------
    ValueError
        If ``file`` has neither extension. As this is a generator, it is
        raised when iteration starts.

    Notes
    -----
    An error while reading or parsing is not propagated: its traceback is
    printed and the generator simply ends.
    """
    if file.endswith(".sdf.gz"):
        opener = gzip.open
    elif file.endswith(".sdf"):
        opener = open
    else:
        raise ValueError("file extension is not sdf or sdf.gz")

    count = 0
    cid = None
    smiles = None
    cid_flag = False
    smiles_flag = False
    try:
        # Text mode decodes the gzip stream for us; the with-block closes
        # the file even when the consumer stops iterating early.
        with opener(file, 'rt', encoding='utf-8') as fi:
            for line in fi:
                # The previous line was a field header: this line is its value.
                if cid_flag:
                    cid = int(line)
                    cid_flag = False
                if smiles_flag:
                    smiles = line.strip()
                    smiles_flag = False

                if 'PUBCHEM_COMPOUND_CID' in line:
                    cid_flag = True

                if 'PUBCHEM_OPENEYE_CAN_SMILES' in line:
                    smiles_flag = True

                if line.startswith("$$$$"):
                    if cid is not None or smiles is not None:
                        yield cid, smiles
                        count += 1
                    # Reset so values never leak into the next record.
                    cid = None
                    smiles = None

            # Emit a trailing record only if the file ended without a final
            # "$$$$" delimiter; otherwise it has already been yielded.
            if cid is not None or smiles is not None:
                yield cid, smiles
                count += 1

        print(f"block {count} is done")
        print("done")
    except Exception:
        print(traceback.format_exc())

CIDとSMILESを取得してcsvに書き込む。

import csv

# Write every (CID, SMILES) pair to a CSV file.
# newline='' hands line-ending control to the csv module, so the "\n"
# terminator below is written unchanged on every platform.
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')  # end each row with "\n"
    writer.writerow(['CID', 'SMILES'])  # header row
    # Each item yielded by the generator is one (cid, smiles) row.
    writer.writerows(splitMolBlock_exist(file_path))

書き込まれたcsvの中身

CID,SMILES
1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
2,CC(=O)OC(CC(=O)O)C[N+](C)(C)C
3,C1=CC(C(C(=C1)C(=O)O)O)O
4,CC(CN)O
5,C(C(=O)COP(=O)(O)O)N
6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
7,CCN1C=NC2=C(N=CN=C21)N
...

以上でPubChemに登録されている化合物のリストを取得することができた。

株式会社piponでは技術でお困りのことがある方はオンライン相談が可能です。
こちらから会社概要資料をDLできます！
お問い合わせ内容に「オンライン相談希望」とご記載ください。

株式会社piponでは定期的に技術勉強会を開催しています。
ChatGPT・AI・データサイエンスについてご興味がある方は是非、ご参加ください。
https://chatgptllm.connpass.com/

株式会社piponではChatGPT・AI・データサイエンスについて業界ごとの事例を紹介しています。ご興味ある方はこちらのオウンドメディアをご覧ください。
https://bigdata-tools.com/

株式会社piponのテックブログPublication

株式会社piponのテックブログです。 ChatGPTやAzureをメインに情報発信していきます！お問い合わせはフォームへお願いします。会社HP pipon.co.jp/ フォーム share.hsforms.com/19XNce4U5TZuPebGH_BB9Igegfgt

pubchemの化合物の情報が入ったSDFファイルのDLリンクを取得する

URLからファイルをDLする

DLしたファイルからCIDとSMILESを取得する

Discussion