物种分类查询API
BioNote 2022-08-28
SQL
web programming
# 背景
通过NCBI taxonomy库可以查询物种分类,但有时候ncbi打开很慢,并且不是很好嵌入到脚本中使用。
虽然可以通过taxonomy ftp站点下载库文件,解析对应文件拿到物种分类,但nodes和name两个文件加起来也几百兆,一个小脚本带着这么几百兆的文件似乎也不方便。
通过fastAPI+mySQL做一个自己web API提供服务。
# 数据整理
先从ftp站点下载对应压缩包,其中 nodes.dmp和names.dmp内容及格式如下
# nodes.dmp
This file represents taxonomy nodes. The description for each node includes the following fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ...)
embl code -- locus-name prefix; not unique
division id -- see division.dmp file
inherited div flag (1 or 0) -- 1 if node inherits division from parent
genetic code id -- see gencode.dmp file
inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
mitochondrial genetic code id -- see gencode.dmp file
inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
comments -- free-text comments and citations
# names.dmp
Taxonomy names file has these fields:
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (synonym, common name, ...)
# 信息解析
通过读取nodes中对应关系和names中的名称,将上述两个文件转换为一张物种表格。写一个小的python脚本进行转换。
import sys
import os
# Lookup tables built from nodes.dmp / names.dmp:
#   tax_sp       taxid -> rank (species, genus, ...)
#   tax_parent   taxid -> parent taxid
#   tax_name     taxid -> list of every name (any name class)
#   tax_sci_name taxid -> scientific name
tax_sp = {}
tax_parent = {}
tax_name = {}
tax_sci_name = {}


def recur_tax(tax_par, tax_sci_name, tax_sp, taxid):
    """Walk from *taxid* up to the root (taxid '1'), collecting each
    ancestor's scientific name and rank (root itself excluded).

    Returns ``(names, ranks)`` ordered from the node outward.
    Implemented iteratively instead of recursively so deep lineages
    cannot hit Python's recursion limit, and without relying on the
    module-level ``sp_list``/``class_list`` globals the original used.
    """
    names, ranks = [], []
    while taxid != '1':
        names.append(tax_sci_name[taxid])
        ranks.append(tax_sp[taxid])
        taxid = tax_par[taxid]
    return names, ranks
# Build taxid -> rank / parent maps from nodes.dmp ('|'-separated fields).
with open('nodes.dmp') as f:
    for line in f:
        fields = [x.strip() for x in line.strip().split('|')]
        tax_sp[fields[0]] = fields[2]
        tax_parent[fields[0]] = fields[1]

# Collect every name for each taxid; keep the scientific name separately
# (first occurrence wins, matching the original behaviour).
with open('names.dmp') as f:
    for line in f:
        fields = [x.strip() for x in line.strip().split('|')]
        taxid, name_txt, name_class = fields[0], fields[1], fields[3]
        tax_name.setdefault(taxid, []).append(name_txt)
        if name_class == 'scientific name' and taxid not in tax_sci_name:
            tax_sci_name[taxid] = name_txt

# Emit one row per (name, taxid) pair:
#   name <tab> taxid <tab> lineage names <tab> lineage ranks
with open('tax.txt', 'w') as f:
    for key in tax_sp:
        # Reset the globals recur_tax's original recursive version relied on.
        sp_list = []
        class_list = []
        tmp_sp_list, tmp_class_list = recur_tax(tax_parent, tax_sci_name, tax_sp, key)
        # if 'Bacteria' not in tmp_sp_list:  # optional filter kept from the original
        for name in set(tax_name[key]):
            f.write('%s\t%s\t%s\t%s\n' % (name, key, ','.join(tmp_sp_list), ','.join(tmp_class_list)))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
由于脚本中存在较深的递归调用,python2 运行该脚本明显比 python3 慢,建议使用 python3 运行该脚本。
# 数据入mySQL库
得到的tax.txt
即是一张大表,通过pymysql相关函数操作mySQL数据库,将对应结果存入数据库。
import pymysql

# NOTE(review): host/user/passwd are blank placeholders -- fill in before running.
conn = pymysql.connect(host='', user='', passwd='', db='taxon',
                       charset="utf8", port=3306)
cursor = conn.cursor()
try:
    cursor.execute("""create table if not exists taxon_class(
    id int primary key auto_increment not null,
    search_name varchar(255) ,
    taxid int,
    tax_name varchar(5000),
    tax_class varchar(5000))engine=InnoDB default charset=utf8;""")
    # Index on search_name so the API's name lookups are fast.
    cursor.execute('ALTER TABLE taxon_class ADD INDEX search_name_index(search_name);')

    insert_sql = ("insert into taxon_class(search_name,taxid,tax_name,tax_class)"
                  + ' values(%s)' % (','.join(['%s'] * 4)))
    with open('tax.txt') as f:
        # Skip the first two lines, as the original script did.
        # NOTE(review): tax.txt as generated above has no header -- confirm
        # these two lines really should be dropped.
        f.readline()
        f.readline()
        batch = []
        for line in f:
            batch.append(tuple(line.strip().split('\t')))
            # Flush in chunks so the whole multi-million-row table is never
            # held in memory at once (the original buffered everything).
            if len(batch) >= 10000:
                cursor.executemany(insert_sql, batch)
                batch = []
        if batch:
            cursor.executemany(insert_sql, batch)
    # Commit once at the end; everything is one transaction.
    conn.commit()
finally:
    # Always release the cursor and connection, even on failure.
    cursor.close()
    conn.close()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# 数据库查询
避免每次查询都需要对数据库进行连接,由此带来不必要的性能损耗,使用dbutils提供的工具创建连接池。
import pymysql
from dbutils.pooled_db import PooledDB
class MysqlPool:
    """Context manager that hands out connections from a shared PooledDB pool.

    Usage::

        with MysqlPool() as db:
            db.cursor.execute(...)
    """

    # NOTE(review): host/user/password are placeholders -- fill in before use.
    config = {
        'creator': pymysql,
        'host': " ",
        'port': 3306,  # the original had an empty value here (syntax error); 3306 is the MySQL default
        'user': " ",
        'password': " ",
        'db': "taxon",
        'charset': 'utf8',
        'maxconnections': 10,  # upper bound on pooled connections
        'cursorclass': pymysql.cursors.DictCursor,  # rows come back as dicts
    }
    # One pool shared by every instance; created once at class-definition time.
    pool = PooledDB(**config)

    def __enter__(self):
        # Borrow a connection (and open a cursor) from the pool for this block.
        self.conn = MysqlPool.pool.connection()
        self.cursor = self.conn.cursor()
        return self

    def __exit__(self, type, value, trace):
        # close() on a pooled connection returns it to the pool rather than
        # tearing it down.
        self.cursor.close()
        self.conn.close()
def db_conn(func):
    """Decorator that borrows a pooled DB handle and passes it to *func*
    as its first argument; the handle is released when *func* returns.
    """
    import functools

    @functools.wraps(func)  # preserve the wrapped function's name/docstring
    def wrapper(*args, **kw):
        with MysqlPool() as db:
            return func(db, *args, **kw)
    return wrapper
@db_conn
def search_sp(db, sp):
    """Return all taxon_class rows whose search_name equals *sp*.

    Uses a parameterised query; the original interpolated *sp* straight
    into the SQL string with ``%``, which is an SQL-injection hole.
    """
    q = "SELECT * FROM taxon.taxon_class WHERE search_name = %s;"
    db.cursor.execute(q, (sp,))
    return db.cursor.fetchall()


if __name__ == '__main__':
    print(search_sp('human'))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
返回结果如下:
[{'id': 3619413, 'search_name': 'human', 'taxid': 9606, 'tax_name': 'Homo sapiens,Homo,Homininae,Hominidae,Hominoidea,Catarrhini,Simiiformes,Haplorrhini,Primates,Euarchontoglires,Boreoeutheria,Eutheria,Theria,Mammalia,Amniota,Tetrapoda,Dipnotetrapodomorpha,Sarcopterygii,Euteleostomi,Teleostomi,Gnathostomata,Vertebrata,Craniata,Chordata,Deuterostomia,Bilateria,Eumetazoa,Metazoa,Opisthokonta,Eukaryota,cellular organisms', 'tax_class': 'species,genus,subfamily,family,superfamily,parvorder,infraorder,suborder,order,superorder,clade,clade,clade,class,clade,clade,clade,superclass,clade,clade,clade,clade,subphylum,phylum,clade,clade,clade,kingdom,clade,superkingdom,no rank'}]
1
fastAPI 本身不绑定具体数据库访问方式,这里采用 ORM 模型,使用 sqlalchemy 进行查询
# fastAPI使用
建立如下目录结构:
.
└── taxon_api
├── __init__.py
├── crud.py
├── database.py
├── main.py
├── models.py
└── schemas.py
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
- __init__.py 是一个空文件,但是说明 taxon_api 是一个package
- database.py 数据库配置相关
- models.py 数据库模型表
- schemas.py 模型验证
- crud.py 数据库操作相关
- main.py 主文件
database.py
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# NOTE(review): credentials are hard-coded -- move them to environment
# variables or a config file before deploying.
# The client charset is set via the URL query string; create_engine's old
# ``encoding`` argument is deprecated (and removed in SQLAlchemy 2.0).
SQLALCHEMY_DATABASE_URL = "mysql+pymysql://root:123456@localhost:3306/taxon?charset=utf8"

# echo=True logs every emitted statement (with its parameter list) --
# useful while developing, noisy in production.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, echo=True
)

# All CRUD goes through a session; each SessionLocal() call opens one.
# autoflush=False: pending changes are not auto-sent before queries;
# autocommit=False: changes persist only on an explicit commit().
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class the ORM models inherit from.
Base = declarative_base()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
models.py
from sqlalchemy import Boolean, Column, Integer, String
from database import Base
class Taxon_class(Base):
    # ORM model for the ``taxon_class`` table filled by the ingest script:
    # one row per (searchable name, taxid) pair, carrying the full lineage.
    __tablename__ = "taxon_class"

    id = Column(Integer, primary_key=True, index=True)
    # Any of a taxon's names (synonym, common name, ...); indexed for lookup.
    search_name = Column(String(255), index=True)
    taxid = Column(Integer)
    # Comma-joined lineage: scientific names and ranks, node -> root.
    tax_name = Column(String(5000))
    tax_class = Column(String(5000))
1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
crud.py
from sqlalchemy.orm import Session
import models
def search(db: Session, sp: str):
    """Return the first taxon_class row whose search_name equals *sp*, or None."""
    return db.query(models.Taxon_class).filter(models.Taxon_class.search_name == sp).first()
1
2
3
4
5
6
2
3
4
5
6
main.py
from fastapi import FastAPI, Depends, HTTPException
import crud
from database import SessionLocal, engine, Base
from sqlalchemy.orm import Session
import uvicorn
# Create any missing tables at import time (no-op when they already exist).
Base.metadata.create_all(bind=engine)

app = FastAPI()


# Dependency
def get_db():
    """
    Yield a per-request database session.

    Each request gets its own session, and the ``finally`` clause closes it
    once the request has been handled -- even if the handler raises.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
@app.get("/taxon/")
def read_taxon(sp: str = 'human', db: Session = Depends(get_db)):
    """Look up a species by name; 404 when it is unknown.

    Declared as a plain ``def`` (not ``async def``): the SQLAlchemy call is
    blocking, and FastAPI runs sync path operations in a threadpool, so the
    event loop is never stalled by the database round-trip.
    """
    db_sp = crud.search(db, sp=sp)
    if not db_sp:
        raise HTTPException(status_code=404, detail="Species Not Found")
    return db_sp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# 运行fastAPI
使用uvicorn运行程序
uvicorn --host ip --port port --reload main:app
1