物种分类查询API
BioNote 2022-08-28
SQL
web programming
# 背景
通过NCBI taxonomy库可以查询物种分类,但有时候ncbi打开很慢,并且不是很好嵌入到脚本中使用。
虽然可以通过taxonomy ftp站点下载库文件,解析对应文件拿到物种分类,但nodes和name两个文件加起来也几百兆,一个小脚本带着这么几百兆的文件似乎也不方便。
通过fastAPI+mySQL做一个自己web API提供服务。
# 数据整理
先从ftp站点下载对应压缩包,其中 nodes.dmp和names.dmp内容及格式如下
# nodes.dmp
This file represents taxonomy nodes. The description for each node includes the following fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ...)
embl code -- locus-name prefix; not unique
division id -- see division.dmp file
inherited div flag (1 or 0) -- 1 if node inherits division from parent
genetic code id -- see gencode.dmp file
inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
mitochondrial genetic code id -- see gencode.dmp file
inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
comments -- free-text comments and citations
# names.dmp
Taxonomy names file has these fields:
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (synonym, common name, ...)
# 信息解析
通过读取nodes中对应关系和names中的名称,将上述两个文件转换为一张物种表格。写一个小的python脚本进行转换。
import sys
import os
# Lookup tables built from nodes.dmp / names.dmp:
#   tax_sp       taxid -> rank (species, genus, ...)
#   tax_parent   taxid -> parent taxid
#   tax_name     taxid -> list of every name (any name class)
#   tax_sci_name taxid -> scientific name
tax_sp = {}
tax_parent = {}
tax_name = {}
tax_sci_name = {}


def recur_tax(tax_par, tax_sci_name, tax_sp, taxid):
    """Walk from *taxid* up to the root (taxid '1'), collecting each
    ancestor's scientific name and rank (root itself excluded).

    Returns ``(names, ranks)`` ordered from the node outward.
    Implemented iteratively instead of recursively so deep lineages
    cannot hit Python's recursion limit, and without relying on the
    module-level ``sp_list``/``class_list`` globals the original used.
    """
    names, ranks = [], []
    while taxid != '1':
        names.append(tax_sci_name[taxid])
        ranks.append(tax_sp[taxid])
        taxid = tax_par[taxid]
    return names, ranks
# Build taxid -> rank / parent maps from nodes.dmp ('|'-separated fields).
with open('nodes.dmp') as f:
    for line in f:
        fields = [x.strip() for x in line.strip().split('|')]
        tax_sp[fields[0]] = fields[2]
        tax_parent[fields[0]] = fields[1]

# Collect every name for each taxid; keep the scientific name separately
# (first occurrence wins, matching the original behaviour).
with open('names.dmp') as f:
    for line in f:
        fields = [x.strip() for x in line.strip().split('|')]
        taxid, name_txt, name_class = fields[0], fields[1], fields[3]
        tax_name.setdefault(taxid, []).append(name_txt)
        if name_class == 'scientific name' and taxid not in tax_sci_name:
            tax_sci_name[taxid] = name_txt

# Emit one row per (name, taxid) pair:
#   name <tab> taxid <tab> lineage names <tab> lineage ranks
with open('tax.txt', 'w') as f:
    for key in tax_sp:
        # Reset the globals recur_tax's original recursive version relied on.
        sp_list = []
        class_list = []
        tmp_sp_list, tmp_class_list = recur_tax(tax_parent, tax_sci_name, tax_sp, key)
        # if 'Bacteria' not in tmp_sp_list:  # optional filter kept from the original
        for name in set(tax_name[key]):
            f.write('%s\t%s\t%s\t%s\n' % (name, key, ','.join(tmp_sp_list), ','.join(tmp_class_list)))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
由于脚本中存在较深的递归调用,python2 运行该脚本明显比 python3 慢,建议使用 python3 运行该脚本。
# 数据入mySQL库
得到的tax.txt
即是一张大表,通过pymysql相关函数操作mySQL数据库,将对应结果存入数据库。
import pymysql

# NOTE(review): host/user/passwd are blank placeholders -- fill in before running.
conn = pymysql.connect(host='', user='', passwd='', db='taxon',
                       charset="utf8", port=3306)
cursor = conn.cursor()
try:
    cursor.execute("""create table if not exists taxon_class(
    id int primary key auto_increment not null,
    search_name varchar(255) ,
    taxid int,
    tax_name varchar(5000),
    tax_class varchar(5000))engine=InnoDB default charset=utf8;""")
    # Index on search_name so the API's name lookups are fast.
    cursor.execute('ALTER TABLE taxon_class ADD INDEX search_name_index(search_name);')

    insert_sql = ("insert into taxon_class(search_name,taxid,tax_name,tax_class)"
                  + ' values(%s)' % (','.join(['%s'] * 4)))
    with open('tax.txt') as f:
        # Skip the first two lines, as the original script did.
        # NOTE(review): tax.txt as generated above has no header -- confirm
        # these two lines really should be dropped.
        f.readline()
        f.readline()
        batch = []
        for line in f:
            batch.append(tuple(line.strip().split('\t')))
            # Flush in chunks so the whole multi-million-row table is never
            # held in memory at once (the original buffered everything).
            if len(batch) >= 10000:
                cursor.executemany(insert_sql, batch)
                batch = []
        if batch:
            cursor.executemany(insert_sql, batch)
    # Commit once at the end; everything is one transaction.
    conn.commit()
finally:
    # Always release the cursor and connection, even on failure.
    cursor.close()
    conn.close()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# 数据库查询
避免每次查询都需要对数据库进行连接,由此带来不必要的性能损耗,使用dbutils提供的工具创建连接池。
import pymysql
from dbutils.pooled_db import PooledDB
class MysqlPool:
    """Context manager that hands out connections from a shared PooledDB pool.

    Usage::

        with MysqlPool() as db:
            db.cursor.execute(...)
    """

    # NOTE(review): host/user/password are placeholders -- fill in before use.
    config = {
        'creator': pymysql,
        'host': " ",
        'port': 3306,  # the original had an empty value here (syntax error); 3306 is the MySQL default
        'user': " ",
        'password': " ",
        'db': "taxon",
        'charset': 'utf8',
        'maxconnections': 10,  # upper bound on pooled connections
        'cursorclass': pymysql.cursors.DictCursor,  # rows come back as dicts
    }
    # One pool shared by every instance; created once at class-definition time.
    pool = PooledDB(**config)

    def __enter__(self):
        # Borrow a connection (and open a cursor) from the pool for this block.
        self.conn = MysqlPool.pool.connection()
        self.cursor = self.conn.cursor()
        return self

    def __exit__(self, type, value, trace):
        # close() on a pooled connection returns it to the pool rather than
        # tearing it down.
        self.cursor.close()
        self.conn.close()
def db_conn(func):
    """Decorator that borrows a pooled DB handle and passes it to *func*
    as its first argument; the handle is released when *func* returns.
    """
    import functools

    @functools.wraps(func)  # preserve the wrapped function's name/docstring
    def wrapper(*args, **kw):
        with MysqlPool() as db:
            return func(db, *args, **kw)
    return wrapper
@db_conn
def search_sp(db, sp):
    """Return all taxon_class rows whose search_name equals *sp*.

    Uses a parameterised query; the original interpolated *sp* straight
    into the SQL string with ``%``, which is an SQL-injection hole.
    """
    q = "SELECT * FROM taxon.taxon_class WHERE search_name = %s;"
    db.cursor.execute(q, (sp,))
    return db.cursor.fetchall()


if __name__ == '__main__':
    print(search_sp('human'))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
返回结果如下:
[{'id': 3619413, 'search_name': 'human', 'taxid': 9606, 'tax_name': 'Homo sapiens,Homo,Homininae,Hominidae,Hominoidea,Catarrhini,Simiiformes,Haplorrhini,Primates,Euarchontoglires,Boreoeutheria,Eutheria,Theria,Mammalia,Amniota,Tetrapoda,Dipnotetrapodomorpha,Sarcopterygii,Euteleostomi,Teleostomi,Gnathostomata,Vertebrata,Craniata,Chordata,Deuterostomia,Bilateria,Eumetazoa,Metazoa,Opisthokonta,Eukaryota,cellular organisms', 'tax_class': 'species,genus,subfamily,family,superfamily,parvorder,infraorder,suborder,order,superorder,clade,clade,clade,class,clade,clade,clade,superclass,clade,clade,clade,clade,subphylum,phylum,clade,clade,clade,kingdom,clade,superkingdom,no rank'}]
1
fastAPI 本身不绑定具体数据库访问方式,这里采用 ORM 模型,使用 sqlalchemy 进行查询
# fastAPI使用
建立如下目录结构:
.
└── taxon_api
├── __init__.py
├── crud.py
├── database.py
├── main.py
├── models.py
└── schemas.py
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
- __init__.py 是一个空文件,但是说明 taxon_api 是一个package
- database.py 数据库配置相关
- models.py 数据库模型表
- schemas.py 模型验证
- crud.py 数据库操作相关
- main.py 主文件
database.py
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# NOTE(review): credentials are hard-coded -- move them to environment
# variables or a config file before deploying.
# The client charset is set via the URL query string; create_engine's old
# ``encoding`` argument is deprecated (and removed in SQLAlchemy 2.0).
SQLALCHEMY_DATABASE_URL = "mysql+pymysql://root:123456@localhost:3306/taxon?charset=utf8"

# echo=True logs every emitted statement (with its parameter list) --
# useful while developing, noisy in production.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, echo=True
)

# All CRUD goes through a session; each SessionLocal() call opens one.
# autoflush=False: pending changes are not auto-sent before queries;
# autocommit=False: changes persist only on an explicit commit().
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class the ORM models inherit from.
Base = declarative_base()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
models.py
from sqlalchemy import Boolean, Column, Integer, String
from database import Base
class Taxon_class(Base):
    # ORM model for the ``taxon_class`` table filled by the ingest script:
    # one row per (searchable name, taxid) pair, carrying the full lineage.
    __tablename__ = "taxon_class"

    id = Column(Integer, primary_key=True, index=True)
    # Any of a taxon's names (synonym, common name, ...); indexed for lookup.
    search_name = Column(String(255), index=True)
    taxid = Column(Integer)
    # Comma-joined lineage: scientific names and ranks, node -> root.
    tax_name = Column(String(5000))
    tax_class = Column(String(5000))
1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
crud.py
from sqlalchemy.orm import Session
import models
def search(db: Session, sp: str):
    """Return the first taxon_class row whose search_name equals *sp*, or None."""
    return db.query(models.Taxon_class).filter(models.Taxon_class.search_name == sp).first()
1
2
3
4
5
6
2
3
4
5
6
main.py
from fastapi import FastAPI, Depends, HTTPException
import crud
from database import SessionLocal, engine, Base
from sqlalchemy.orm import Session
import uvicorn
# Create any missing tables at import time (no-op when they already exist).
Base.metadata.create_all(bind=engine)

app = FastAPI()


# Dependency
def get_db():
    """
    Yield a per-request database session.

    Each request gets its own session, and the ``finally`` clause closes it
    once the request has been handled -- even if the handler raises.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
@app.get("/taxon/")
def read_taxon(sp: str = 'human', db: Session = Depends(get_db)):
    """Look up a species by name; 404 when it is unknown.

    Declared as a plain ``def`` (not ``async def``): the SQLAlchemy call is
    blocking, and FastAPI runs sync path operations in a threadpool, so the
    event loop is never stalled by the database round-trip.
    """
    db_sp = crud.search(db, sp=sp)
    if not db_sp:
        raise HTTPException(status_code=404, detail="Species Not Found")
    return db_sp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# 运行fastAPI
使用uvicorn运行程序
uvicorn --host ip --port port --reload main:app
1