# -*- coding:UTF-8 -*-
from __future__ import unicode_literals
import os
import sys
import json
import csv
import time
from contextlib import contextmanager
from pymongo import MongoClient
import pandas as pd
import numpy as np
from kinmatch import config
from kinmatch import utils
from kinmatch.utils import DataContainer
from kinmatch import (AlreadyExistException,
FileTypeException,
NoValueException,
DBConnectionException,
ParameterException,
ParsingException)
@contextmanager
def get_mongodb(dbname=config.DEFAULT_MONGODB_NAME,
user=config.DEFAULT_MONGODB_USER,
password=config.DEFAULT_MONGODB_PASSWORD):
client = MongoClient(config.MONGODB_URL)
try:
client.the_database.authenticate(user, password, source=dbname)
    except Exception:
        raise DBConnectionException('Cannot connect to the database')
    try:
        yield client[dbname]
finally:
client.close()
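# Example usage for get_mongodb (an illustrative sketch, not executed on
# import): the context manager yields a pymongo database handle and closes
# the client afterwards. The identifier value below is hypothetical.
#
#     with get_mongodb() as db:
#         doc = db.genotypes.find_one({'identifier': 'SAMPLE-001'})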
class ResourceManager:
def __init__(self, dbname=config.DEFAULT_MONGODB_NAME):
self.dbname = dbname
    def create(self, *args, **kwargs):
        raise NotImplementedError("create is not implemented.")
    def read(self, *args, **kwargs):
        raise NotImplementedError("read is not implemented.")
    def update(self, *args, **kwargs):
        raise NotImplementedError("update is not implemented.")
    def delete(self, *args, **kwargs):
        raise NotImplementedError("delete is not implemented.")
    def list(self, *args, **kwargs):
        raise NotImplementedError("list is not implemented.")
class TaskManager(ResourceManager):
    def create(self, identifier, query_group, query_type, query_count,\
query_first, query_second, target_group, target_type, target_count,\
target_first, target_second, relationship_type, description,\
partial, astr_option, ystr_option, mtdna_option, dbname):
self.identifier = identifier
self.query_group = query_group
self.query_type = query_type
self.query_count = query_count
self.query_first = query_first
self.query_second = query_second
self.target_group = target_group
self.target_type = target_type
self.target_count = target_count
self.target_first = target_first
self.target_second = target_second
self.relationship_type = relationship_type
self.description = description
self.partial = partial
self.astr_option = astr_option
self.ystr_option = ystr_option
self.mtdna_option = mtdna_option
        self.status = 'Submitted'
        self.dbname = dbname  # use the dbname argument for the connection below
with get_mongodb(self.dbname) as db:
db.multiple_search_task.insert({'identifier': self.identifier,\
'query_group': self.query_group,\
'query_type' : self.query_type,\
'query_first' : self.query_first,\
'query_second' : self.query_second,\
'query_count' : self.query_count,\
'target_group' : self.target_group,\
'target_type' : self.target_type,\
'target_count' : self.target_count,\
'target_first' : self.target_first,\
'target_second' : self.target_second,\
'status' : self.status,\
                'registered_time': time.strftime('%Y%m%d%H%M%S'),\
'progress' : 0,\
'result' : None,\
'relationship_type': self.relationship_type,\
'description': self.description,\
'partial': self.partial,\
'astr_option': self.astr_option,\
'ystr_option': self.ystr_option,\
'mtdna_option': self.mtdna_option,\
})
    def get(self, identifier):
with get_mongodb(self.dbname) as db:
task = db.multiple_search_task.find_one({'identifier':identifier})
if task is None:
raise NoValueException('There is no task {0}'.\
format(identifier))
return task
    def read(self, identifier):
task = self.get(identifier)
del(task['_id'])
if task['result']:
task['result'] = pd.DataFrame(json.loads(task['result'])).T
return DataContainer(status='Success',
message='Fetching task identifier {}'.format(identifier),
items=task)
    def update(self, identifier, field, value):
with get_mongodb(self.dbname) as db:
db.multiple_search_task.update({'identifier': identifier},\
{"$set": {field: value}})
    def delete(self, identifier):
with get_mongodb(self.dbname) as db:
db.multiple_search_task.remove({'identifier': identifier})
message = 'Deleted tasks {}'.format(identifier)
return DataContainer(status='Success',
message=message)
    def list(self, identifier=None, query_group=None, query_type=None,\
target_group=None, target_type=None, description=None, status=None,\
sort='registered_time', desc=True, paginate_by=20,\
page=None):
## check arguments
if query_type is not None and\
(query_type not in ['all', 'range', 'keyword', 'manual']):
raise ParameterException('"{}" cannot be used for "query_type"'
' value'.format(query_type))
if sort not in ['identifier', 'query_group', 'query_type',\
'query_count', 'target_group', 'target_type', 'target_count',\
'status', 'progress', 'registered_time']:
raise ParameterException('{} cannot be used for "sort" value'.\
format(sort))
## get task from mongodb
mongodb_query = {}
for k,v in {'identifier': identifier, 'query_group': query_group,\
'query_type': query_type, 'target_group': target_group,\
'target_type': target_type, 'description': description,\
'status': status}.items():
if v is not None:
mongodb_query[k] = {'$regex': v}
with get_mongodb(self.dbname) as db:
tasks = db.multiple_search_task.find(mongodb_query)
desc = -1 if desc else 1
tasks.sort(sort, desc)
results = list()
for task in tasks:
if '_id' in task:
del(task['_id'])
if 'result' in task:
del(task['result'])
results.append(task)
total_count = len(results)
if page is not None:
page = int(page)
results = results[(page-1)*paginate_by : page*paginate_by]
return DataContainer(status='Success',
message='Fetching tasks',
totalCount=total_count,
                totalPageCount=-(-total_count // paginate_by),
items=results)
    def get_alltasks(self, field=None, query=None, sort='registered_time',\
desc=True):
desc = -1 if desc else 1
with get_mongodb(self.dbname) as db:
if field:
tasks = db.multiple_search_task.find({field: {'$regex': query}})
else:
tasks = db.multiple_search_task.find()
tasks.sort(sort, desc)
return tasks
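# Example usage for TaskManager (an illustrative sketch; the task identifier
# is hypothetical and DataContainer is assumed to expose its keyword
# arguments, e.g. `items`, as attributes):
#
#     tm = TaskManager()
#     container = tm.read('20240101-multiple-search-001')
#     task = container.items
#     print(task['status'], task['progress'])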
class GroupManager(ResourceManager):
    def create(self, identifier):
pass
    def read(self, identifier):
        pass
    def update(self, identifier):
        pass
    def delete(self, identifier):
        pass
    def list(self):
        """
        Return the group list.
        """
        pass
    def ids_belong_group(self, group):
        """
        Return the list of identifiers that belong to the given group.
        """
with get_mongodb(self.dbname) as db:
ids = [s['identifier'] for s in\
db.genotypes.find({'groups': group}, {'identifier': 1})]
return ids
class AlleleFrequenciesManager(ResourceManager):
current_allele_frequency = {
'astr_15': {'type_': 'A-STR', 'markers':
['D8S1179', 'D21S11', 'D7S820', 'CSF1PO', 'D3S1358', 'TH01',
'D13S317', 'D16S539', 'D2S1338', 'D19S433', 'vWA', 'TPOX',
'D18S51', 'D5S818', 'FGA']},
'astr_23': {'type_': 'A-STR', 'markers':
['D8S1179', 'D21S11', 'D7S820', 'CSF1PO', 'D3S1358', 'TH01',
'D13S317', 'D16S539', 'D2S1338', 'D19S433', 'vWA', 'TPOX',
'D18S51', 'D5S818', 'FGA', 'Penta_E', 'Penta_D', 'D10S1248',
'D12S391', 'D1S1656', 'D22S1045', 'D2S441', 'SE33']},
'ystr_16': {'type_': 'Y-STR', 'markers':
['DYS385', 'DYS389II', 'DYS392']},
'ystr_23': {'type_': 'Y-STR', 'markers':
['DYS385', 'DYS389II', 'DYS392', 'DYS635', 'GATAH4__1']},
}
    def create(self, infile, informat, name, type_, description=None):
result = self.add(infile, informat, name, type_, overwrite=False,\
initial=False, description=description) ## return True or False
if result is True:
stat = 'Success'
message = 'Success uploading {}'.format(name)
else:
stat = 'Failure'
message = 'Failure uploading {}'.format(name)
return DataContainer(status=stat, message=message)
    def read(self, name, type_):
result = self.get_from_read(name, type_) ## return dictionary
# result = self.get(name, type_) ## return dataframe
return DataContainer(status='Success',\
message='Success fetching Allele frequency {}'.format(name),\
items=result,\
trans=True)
    def update(self, infile, informat, name, type_, description=None):
result = self.add(infile, informat, name, type_, overwrite=True,
initial=False, description=description)
if result is True:
stat = 'Success'
message = 'Success uploading {}'.format(name)
else:
stat = 'Failure'
message = 'Failure uploading {}'.format(name)
return DataContainer(status=stat, message=message)
## return True or False
    def delete(self, name):
        result = self.delete_allele_frequencies(name)  ## returns True
if result is True:
stat = 'Success'
message = 'Success deleting {}'.format(name)
else:
stat = 'Failure'
message = 'Failure deleting {}'.format(name)
return DataContainer(status=stat, message=message)
    def list(self):
result = self.export_allele_frequency() ## return dataframe
return DataContainer(status='Success',
message='',
items=result)
    def add(self, infile, informat, name, type_, overwrite=False,\
initial=False, description=None):
# if initial:
# answer = input(
# "'-t' option deletes all previous data. Is it right? (y/n) ")
# if answer != 'y':
# return
with get_mongodb(self.dbname) as db:
if initial:
db.allele_frequencies.remove()
prev_record = db.allele_frequencies.find_one({'name': name})
already_exist = False
if prev_record and type_ in prev_record.keys():
already_exist = True
if already_exist and not overwrite:
raise AlreadyExistException('Allele frequencies {} already'
' existed'.format(name))
"""
ext = os.path.splitext(infile.name)[-1].lower()
if ext == '.csv':
data = pd.read_csv(infile, index_col=0)
elif ext in ('.xls', '.xlsx'):
"""
if informat.lower() == 'csv':
data = pd.read_csv(infile, index_col=0)
elif informat.lower() in ('xls', 'xlsx'):
excel_file = pd.ExcelFile(infile)
data = excel_file.parse(excel_file.sheet_names[0], index_col=0)
#data = pd.read_excel(infile.name, 'Sheet1', index_col=0)
else:
raise FileTypeException('Input file must be CSV or XLS, XLSX')
data.index = data.index * 10
data.index = data.index.astype(object).astype(int).astype(str)
data.columns = [utils.encode_marker(column, type_) for column in\
data.columns]
with get_mongodb(self.dbname) as db:
if prev_record and \
((not already_exist) or (already_exist and overwrite)):
db.allele_frequencies.update({'name': name}, {'$set': {
type_: data.to_dict(), 'description': description,
}})
else:
db.allele_frequencies.insert({
'name': name,
type_: data.to_dict(),
'description': description,
})
return True
return False
    def make_current_db(self):
#answer = input(
# "Will you update current allele frequencies? (y/n) ")
#if answer != 'y':
# return
name = 'current'
gm = GenotypeManager(dbname=self.dbname)
for name in self.current_allele_frequency:
type_ = self.current_allele_frequency[name]['type_']
markers = self.current_allele_frequency[name]['markers']
df = pd.DataFrame()
try:
genotypes = gm.get_genotypes(type_)
genotypes = genotypes[markers]
for marker in markers:
genotypes = genotypes[genotypes[marker].notnull()]
except:
continue
for marker in markers:
total_count = 0
alleles_count = {}
for allele in genotypes[marker]:
if type_ in ['A-STR','Y-STR']:
try:
allele_1, allele_2 = allele[0], allele[1]
except:
continue
elif type_ == 'mtDNA':
if not pd.isnull(allele):
allele_1, allele_2 = allele, allele
else:
continue
else:
try:
allele_1, allele_2 = allele[0], allele[0]
except:
continue
total_count += 2
try:
alleles_count[str(allele_1)][0] += 1
except:
alleles_count[str(allele_1)] = [1]
try:
alleles_count[str(allele_2)][0] += 1
except:
alleles_count[str(allele_2)] = [1]
marker = utils.encode_marker(marker, type_)
alleles_frequency = pd.DataFrame(alleles_count,
index=[marker]) / float(total_count)
df = df.append(alleles_frequency)
            if not df.empty:
df = df.T
with get_mongodb(self.dbname) as db:
if db.allele_frequencies.find({'name': name}).count() > 0:
db.allele_frequencies.update({'name': name},
{"$set": {type_: df.to_dict()}})
else:
db.allele_frequencies.insert({
'name': name, type_: df.to_dict()
})
with get_mongodb(self.dbname) as db:
if db.allele_frequencies.find({'name': name}).count() > 0:
db.allele_frequencies.update({'name': name},
{"$set": {'description':
'allele frequencies status in current database'+
' ({})'.format(time.strftime(
'%Y/%m/%d %H:%M',time.localtime()
))}})
    def get_names_info(self):
name_and_info = []
with get_mongodb(self.dbname) as db:
for record in db.allele_frequencies.find():
if 'A-STR' in record:
type_ = 'A-STR'
elif 'Y-STR' in record:
type_ = 'Y-STR'
                elif 'mtDNA' in record:
                    type_ = 'mtDNA'
                else:
                    # skip records that do not carry a known genotype type
                    continue
                af = pd.DataFrame.from_dict(record[type_])
name_and_info.append([record['name'], type_,
str(len(af.columns)), str(len(af.index))])
return name_and_info
    def get(self, name, type_):
with get_mongodb(self.dbname) as db:
af = db.allele_frequencies.find_one(
{'name': name, type_: {"$exists":1}})
if af is None:
raise NoValueException('There is no Allele Frequency {0} {1}'.\
format(name, type_))
if type_ == 'description':
return af['description']
        # create dataframe
        df = pd.DataFrame.from_dict(af[type_])
        df.index = df.index.astype(int)
        df = df.sort_index()
        return df
    def get_from_read(self, name, type_):
result = {}
with get_mongodb(self.dbname) as db:
try:
af_types = list(db.allele_frequencies.find_one({'name': name}).\
keys())
except AttributeError:
raise NoValueException('AlleleFrequency {} does not exist'.\
format(name))
if (type_ is not None) and (type_ not in af_types):
return None
af_types.remove('_id')
af_types.remove('name')
for type2 in af_types:
df = self.get(name, type2)
if type2 != 'description':
df.index = [str(id_/10.0).replace('.0','') for id_ in\
df.index]
result[type2] = df
return result
    def export_allele_frequency(self):
column_names = [\
'name', 'type', 'number of markers', 'number of alleles',\
'description']
with get_mongodb(self.dbname) as db:
result = list()
for af in db.allele_frequencies.find():
desc = af['description'] if 'description' in af else None
for type_ in af:
if type_ not in ['name', '_id','description']:
af_df = pd.DataFrame.from_dict(af[type_])
result.append([
af['name'],
type_,
len(af_df.columns),
len(af_df.index),
desc
])
return None if not result else pd.DataFrame(result,\
columns=column_names)
    def delete_allele_frequencies(self, name):
with get_mongodb(self.dbname) as db:
db.allele_frequencies.remove({'name': name})
return True
    def show(self, name, type_, outfile, seperator='tab', excelfile=''):
        if seperator == 'tab':
            sep = '\t'
        elif seperator == 'comma':
            sep = ','
        else:
            raise ParameterException('"{}" cannot be used for "seperator"'
                ' value'.format(seperator))
df = self.get(name, type_)
df.index = df.index / 10.
if excelfile:
df.to_excel(excelfile, sheet_name='Sheet1')
else:
df.to_csv(outfile, sep=sep)
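# Example usage for AlleleFrequenciesManager (an illustrative sketch; the
# file name and frequency-table name are hypothetical):
#
#     afm = AlleleFrequenciesManager()
#     with open('astr_frequencies.csv') as handle:
#         afm.create(handle, 'csv', 'korean_population', 'A-STR',
#                    description='example upload')
#     container = afm.read('korean_population', 'A-STR')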
class GenotypeManager(ResourceManager):
    def create(self, infile, informat='GeneMark', type_='A-STR', group=None):
if informat not in ['GeneMark', 'CSV', 'TSV', 'XLSX']:
raise ParameterException('informat argument has wrong value')
result = self.add_file(infile, type_, format=informat,
overwrite=False, initial=False, group=group) ## return True
if result is True:
stat = 'Success'
message = 'Success uploading {}'.format(type_)
else:
stat = 'Failure'
message = 'Failure uploading {}'.format(type_)
return DataContainer(status=stat, message=message)
    def read(self, identifier, type_=None, kit='All', null='-'):
result = self.get_genotype(identifier)
del(result['_id'])
del(result['identifier'])
if not result:
raise NoValueException('identifier {} does not exist'.format(\
identifier))
if type_ is not None:
if type_ not in result:
return DataContainer(status='Success',\
message='There is no {} related identifier'.format(\
type_))
# return None
result = {type_: result[type_]}
types = list(result.keys())
for type_ in types:
if type_ == 'A-STR':
gc = utils.ASTRGenotypeCleaner(result[type_], is_encoded=True)
astrs = gc.decode()
result[type_] = [({marker:astrs[marker]} if (marker in astrs)\
else {marker:null}) for marker in\
config.MARKERS_IN_KITS['A-STR'][kit]]
elif type_ == 'Y-STR':
gc = utils.YSTRGenotypeCleaner(result[type_], is_encoded=True)
ystrs = gc.decode()
result[type_] = [({marker:ystrs[marker]} if (marker in ystrs)\
else {marker:null}) for marker in\
config.MARKERS_IN_KITS['Y-STR'][kit]]
elif type_ == 'mtDNA':
gc = utils.MtdnaGenotypeCleaner(result[type_], is_encoded=True)
mtdnas = gc.decode()
result[type_] = mtdnas
else:
pass
return DataContainer(status='Success',\
message='Success fetching Genotype {}'.format(identifier),\
items=result, trans=True)
    def update(self, file_or_individual, *args, **kwargs):
if file_or_individual == 'file':
return self.update_by_file(*args, **kwargs)
elif file_or_individual == 'individual':
return self.update_by_genotype(*args, **kwargs)
    def update_by_file(self, infile, informat='GeneMark', type_='A-STR',\
save_by='replacement', group=None):
        if informat.lower() == 'genemark':
            infile = iter(infile.readlines())
## check save_bys
if save_by not in ['replacement', 'merge-with-overwrite',\
'merge-without-overwrite']:
raise ParameterException('save_by argument has wrong value')
if informat.lower() not in ['genemark', 'csv', 'tsv', 'xlsx']:
raise ParameterException('informat argument has wrong value')
## parse file to make dataframe
if informat.lower() == 'csv':
df = pd.read_csv(infile, encoding='cp949', index_col=['identifier'])
df = df.astype(object).fillna('').astype(str)
df.columns = [str(c).replace('.0', '') for c in df.columns]
        elif informat.lower() == 'tsv':
df = pd.read_csv(infile, encoding='cp949',\
index_col=['identifier'], sep='\t')
df = df.astype(object).fillna('').astype(str)
df.columns = [str(c).replace('.0', '') for c in df.columns]
elif informat.lower() == 'xlsx':
excel_file = pd.ExcelFile(infile)
df = excel_file.parse(excel_file.sheet_names[0])#, index_col=0)
df = df.set_index('identifier')
df = df.astype(object).fillna('').astype(str)
df.columns = [str(c).replace('.0', '') for c in df.columns]
elif informat.lower() == 'genemark':
if type_ == 'mtDNA':
df = self.mtdna_genmark_txt_to_df(infile)
elif type_ in ['A-STR', 'Y-STR']:
df = self.genmark_txt_to_df(infile, type_)
df = df.set_index('identifier')
if type_ == 'mtDNA':
for m in df.columns:
m_ori = utils.decode_marker(m, 'mtDNA')
m = float(m_ori)
if not ((16024 <= m <= 16365) | (73 <= m <= 340) |\
(438 <= m <= 574)):
                    raise ParameterException(
                        'marker "{}" cannot be used'.format(m_ori))
## update database
for identifier, series in df.iterrows():
series = series[series.notnull()]
genotype = utils.encode_genotype(series.to_dict(), type_, None)
if save_by == 'replacement':
self.set_genotype(identifier, type_, genotype)
elif save_by == 'merge-with-overwrite':
self.set_genotype_merge_with_overwrite(identifier, type_,\
genotype)
elif save_by == 'merge-without-overwrite':
self.set_genotype_merge_without_overwrite(identifier, type_,\
genotype)
stat = 'Success'
message = 'Success uploading {}'.format(type_)
return DataContainer(status=stat, message=message)
    def update_by_genotype(self, type_, identifier, genotype,
save_by='merge-with-overwrite'):
if save_by not in ['merge-with-overwrite']:
raise ParameterException('save_by argument has wrong value')
        if type_ == 'mtDNA':
            for m, v in genotype.items():
                # positions are compared numerically, mirroring update_by_file
                position = float(m)
                if not ((16024 <= position <= 16365) or (73 <= position <= 340) or
                        (438 <= position <= 574)):
                    raise ParameterException(
                        'marker "{}" cannot be used'.format(m))
encoded_genotype = {utils.encode_marker(m, type_):\
utils.encode_alleles(v, type_, utils.encode_marker(m, type_))\
for m,v in genotype.items()}
self.set_genotype_merge_with_overwrite(identifier, type_,\
encoded_genotype)
stat = 'Success'
message = 'Success uploading {}'.format(type_)
return DataContainer(status=stat, message=message)
    def delete(self, identifier, genotype='all'):
# result = self.delete_genotype(identifier) ## return None
result = self.remove_genotype(identifier, genotype)
if result is True:
stat = 'Success'
message = 'Success deleting {}'.format(identifier)
else:
stat = 'Failure'
message = 'Failure deleting {}'.format(identifier)
return DataContainer(status=stat, message=message)
    def list(self, type_=None, identifier=None, page=None, paginate_by=10,\
sort=None, desc=False, filter_astr=False, filter_ystr=False,\
filter_mtdna=False):
if type_ is None:
temp_results = []
with get_mongodb(self.dbname) as db:
if identifier:
find_result = db.genotypes.find({'identifier':\
{'$regex': identifier}})
else:
find_result = db.genotypes.find()
for sample in find_result:
id_ = sample['identifier']
del(sample['_id'])
del(sample['identifier'])
temp_result = {}
for key in sample:
if key == 'groups':
temp_result[key] = ','.join(sample[key])
else:
temp_result[key] = len([marker for marker in\
sample[key] if sample[key][marker] is\
not None])
temp_results.append([id_, temp_result])
if not temp_results:
return DataContainer(status='Success',\
message='There is no Genotype "{}"'.\
format(type_), items=[])
result = pd.DataFrame.from_items(temp_results).T
result = result.replace(np.nan, 0)
if filter_astr:
result = result.loc[(result['A-STR']!=0)]
if filter_ystr:
result = result.loc[(result['Y-STR']!=0)]
if filter_mtdna:
result = result.loc[(result['mtDNA']!=0)]
else:
result = self.get_genotypes(type_)
total_count = result.shape[0]
if sort is not None:
if sort == 'identifier':
index = list(result.index)
index.sort(reverse=not desc)
result = result.loc[index]
else:
result = result.sort(sort, ascending=not desc)
if page is not None:
result = result.iloc[(page-1)*paginate_by : page*paginate_by]
if type_ is not None:
result = {type_:result}
return DataContainer(status='Success',
message='Success fetching Genotype {}'.\
format(type_),
items=result,
totalCount=total_count,
                        totalPageCount=-(-total_count // paginate_by),
trans=False,
zeroization=True)
    def delete_genotype(self, identifier):
with get_mongodb(self.dbname) as db:
db.genotypes.remove({'identifier': identifier})
return True
    def get_genotype(self, identifier):
        with get_mongodb(self.dbname) as db:
            genotype = db.genotypes.find_one({'identifier': identifier})
        if genotype is None:
            raise AssertionError('There is no such identifier {}'.format(
                identifier))
        new_genotype = {}
        for type_, type_genotype in genotype.items():
            if type_ in ('A-STR', 'Y-STR', 'mtDNA'):
                g = {}
                for marker, alleles in type_genotype.items():
                    #marker = utils.decode_marker(marker, type_)
                    g[marker] = alleles
                new_genotype[type_] = g
            else:
                new_genotype[type_] = type_genotype
        return new_genotype
    def get_genotype_with_none(self, identifier):
genotype = self.get_genotype(identifier)
if genotype:
for type_ in ('A-STR', 'Y-STR'):
if type_ in genotype:
for marker, alleles in genotype[type_].items():
if type(alleles) != list:
genotype[type_][marker] = None
if 'mtDNA' in genotype:
for position, allele in genotype['mtDNA'].items():
if pd.isnull(allele):
genotype['mtDNA'][position] = None
for type_ in ('A-STR', 'Y-STR', 'mtDNA'):
if type_ in genotype:
if not any(genotype[type_].values()):
genotype.pop(type_)
return genotype
    def remove_genotype(self, identifier, genotype='all'):
if genotype != 'all':
sample_genotype = self.get_genotype(identifier)
for key in ['_id', genotype]:
try:
sample_genotype.pop(key)
except KeyError:
pass
with get_mongodb(self.dbname) as db:
record = db.genotypes.find_one({'identifier':identifier})
if not record:
raise NoValueException('identifier {} does not exist'.format(\
identifier))
db.genotypes.remove({'identifier' : identifier})
if genotype != 'all':
db.genotypes.insert({k:v for k,v in sample_genotype.items()})
return True
    def set_identifier(self, identifier):
with get_mongodb(self.dbname) as db:
db.genotypes.insert({'identifier':identifier})
    def set_genotype(self, identifier, type_, genotype):
with get_mongodb(self.dbname) as db:
db.genotypes.update({'identifier': identifier}, {'$set': {
type_: genotype,
}}, True)
    def set_genotype_merge_with_overwrite(self, identifier, type_, genotype):
with get_mongodb(self.dbname) as db:
merged_genotype = {}
# prev = db.genotypes.find_one({'identifier': identifier})[type_]
try:
prev = db.genotypes.find_one({'identifier': identifier})[type_]
            except (TypeError, KeyError):
raise NoValueException(
'There is no identifier "{0}" or "{1}"'.format(identifier, type_))
if prev:
all_markers = set(list(prev.keys()) + list(genotype.keys()))
else:
all_markers = list(genotype.keys())
for marker in all_markers:
merged_genotype[marker] =\
genotype[marker] if (marker in genotype and\
genotype[marker]) else prev[marker]
db.genotypes.update({'identifier': identifier}, {'$set': {
type_: merged_genotype,
}}, True)
    def set_genotype_merge_without_overwrite(self, identifier, type_, genotype):
with get_mongodb(self.dbname) as db:
merged_genotype = {}
# prev = db.genotypes.find_one({'identifier': identifier})[type_]
try:
prev = db.genotypes.find_one({'identifier': identifier})[type_]
            except (TypeError, KeyError):
raise NoValueException(
'There is no identifier "{0}" or "{1}"'.format(identifier, type_))
if prev:
all_markers = set(list(prev.keys()) + list(genotype.keys()))
else:
all_markers = list(genotype.keys())
for marker in all_markers:
merged_genotype[marker] =\
prev[marker] if (marker in prev and prev[marker])\
else genotype[marker]
db.genotypes.update({'identifier': identifier}, {'$set': {
type_: merged_genotype,
}}, True)
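    # Merge semantics of the two helpers above (illustrative; marker names and
    # list-encoded allele values are hypothetical):
    #   prev     = {'TH01': [6, 7]}
    #   genotype = {'TH01': [8, 9], 'FGA': [20, 21]}
    #   merge_with_overwrite    -> {'TH01': [8, 9], 'FGA': [20, 21]}
    #   merge_without_overwrite -> {'TH01': [6, 7], 'FGA': [20, 21]}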
    def add_group(self, identifiers, group):
with get_mongodb(self.dbname) as db:
for identifier in identifiers:
record = db.genotypes.find_one({'identifier': identifier})
if record:
if 'groups' not in record:
record['groups'] = []
if group not in record['groups']:
record['groups'].append(group)
else:
record = {'groups':[group]}
db.genotypes.update({'identifier': identifier}, {'$set': {
'groups': record['groups'],
}})
    def get_genotypes(self, type_):
with get_mongodb(self.dbname) as db:
cursor = db.genotypes.find({type_:{"$exists":1}},
{'identifier': 1, type_: 1})
genotypes = pd.DataFrame.from_items(
(g['identifier'], g[type_]) for g in cursor).T
if type_ == 'mtDNA':
new_columns = []
for c in genotypes.columns:
#new_columns.append(utils.decode_marker(c, type_))
new_columns.append(c)
genotypes.columns = new_columns
return genotypes
    def get_genotypes_by_group(self, type_, group):
with get_mongodb(self.dbname) as db:
cursor = db.genotypes.find(
{type_:{"$exists":1}, 'groups':{"$in":[group]}},
{'identifier': 1, type_: 1, 'groups': 1})
genotypes = pd.DataFrame.from_items(
(g['identifier'], g[type_]) for g in cursor).T
return genotypes
    def set_initial(self):
# answer = input(
# "'-t' option deletes all previous data. Is it right? (y/n) ")
# if answer != 'y':
# return
with get_mongodb(self.dbname) as db:
db.genotypes.remove()
    def add_file(self, infile, type_, format='GeneMark', overwrite=False,\
initial=False, group=None):
if initial:
self.set_initial()
if type_ == 'mtDNA':
if format == 'GeneMark':
self._add_mtDNA_genemark_file(infile,
overwrite=overwrite, group=group)
elif format == 'JSON':
self._add_mtDNA_JSON_file(infile,
overwrite=overwrite, group=group)
elif format == 'CSV':
self._add_mtDNA_csv_file(infile,
overwrite=overwrite, group=group)
else:
                raise FileTypeException(
                    'Input file must be CSV, JSON or GeneMark')
else:
if format == 'GeneMark':
self._add_file_genemark(infile, type_,\
overwrite=overwrite, group=group)
elif format == 'JSON':
self._add_file_json(infile, type_, overwrite=overwrite,
group=group)
elif format == 'CSV':
self._add_file_csv(infile, type_, overwrite=overwrite,
group=group)
else:
                raise FileTypeException(
                    'Input file must be CSV, JSON or GeneMark')
return True
def _add_mtDNA_genemark_file(self, infile, overwrite=False,
group=None):
type_ = 'mtDNA'
headers = []
bodies = []
infile = iter(infile.readlines())
for line in infile:
if type(line) != str:
line = line.decode('utf-8')
sp = line.strip().split('\t')
if sp[0] == 'Specimen':
headers = sp
line = next(infile)
if type(line) != str:
line = line.decode('utf-8')
bodies = line.strip().split('\t')
break
if headers == []:
raise AssertionError('There is no Specimen in Genmapper file.')
elif len(headers) != len(bodies):
raise AssertionError('The number of header and body is different.')
project = bodies[0]
mtDNA = {}
for i in range(1,len(headers)):
try:
trans, position, area = headers[i].split('___')
except ValueError:
raise Exception('marker name "{}" is not proper in Genmapper '
'file.'.format(headers[i]))
try:
gene_code, remainder = bodies[i].split(' ')
except ValueError:
raise Exception('genotype "{}" is not proper in Genmapper '
'file.'.format(bodies[i]))
if gene_code == '-':
gene_code = 'd'
if trans == '-':
additional_num = 1
start_index = headers.index(headers[i])
while headers.index(headers[i], start_index) != i:
additional_num += 1
start_index = headers.index(headers[i], start_index+1)
position = position + '.{}'.format(additional_num)
mtDNA[position] = gene_code
with get_mongodb(self.dbname) as db:
prev_record = \
db.genotypes.find_one({'identifier':project})
if prev_record:
if overwrite or (type_ not in prev_record.keys()):
db.genotypes.update({'identifier':project},
{'$set': {type_:mtDNA}})
else:
                    raise AlreadyExistException(
                        'Uploaded mtDNA example already existed')
else:
db.genotypes.insert({
'identifier': project,
type_: mtDNA})
if group:
try:
prev_group = db.genotypes.find_one(
{'identifier': project})['groups']
except:
prev_group = []
if group not in prev_group:
prev_group.append(group)
db.genotypes.update({'identifier': project},
{"$set": {'groups': prev_group}})
def _add_mtDNA_csv_file(self, infile, overwrite=False, group=None):
csv_reader = csv.reader(infile)
columns = next(csv_reader)
type_ = 'mtDNA'
with get_mongodb(self.dbname) as db:
for record in csv_reader:
identifier = record[0]
motif_dict = {}
for hv in record[1:4]:
for motif_ in hv.split():
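                    # Entries beginning with the Korean character '불'
                    # (e.g. '불명', "undetermined") are assumed to mark
                    # uncallable motifs and are skipped below.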
if motif_.startswith('불'):
continue
if motif_ == '16183C16189C':
motif_ = '16183C,16189C'
if motif_ == '309.1':
motif_ = '309.1C'
if motif_ == '235G315.1C':
motif_ = '235G,315.1C'
if motif_ == '315C.1C':
motif_ = '315.1C'
if motif_ == '16209Y16218T':
motif_ = '16209Y,16218T'
if '.p' in motif_:
int_part = motif_.split('.p')[0]
temp_series = [m for m in hv.split() if\
int_part in m]
temp_series.remove(motif_)
temp_series.sort()
max_series_num = int(temp_series[-1][-2])
motif_ = '{}.{}{}'.format(int_part,max_series_num+1,
motif_[-1])
for motif in [m for m in motif_.split(',') if m]:
motif = motif.replace('?', '')
if motif.endswith('.'):
motif = motif.replace('.', '')
if motif[-1] in '1234567890':
motif = motif[:-1]
"""
if '.' in motif:
motif_splited = motif.split('.')
position = motif_splited[0]
snp = motif_splited[1]
if position[-1] not in '1234567890':
position = position[:-1]
else:
position = motif[:-1]
snp = motif[-1]
"""
position = motif[:-1]
try:
int_position = float(position)
except:
print("Not int: {}, {}".format(identifier,
position))
#position = position.replace('.','__')
position = utils.encode_marker(position, type_)
snp = motif[-1]
motif_dict[position] = snp
prev_record = db.genotypes.find_one({'identifier': identifier})
if prev_record:
if overwrite or (type_ not in prev_record):
db.genotypes.update({'identifier': identifier},
{'$set': {type_: motif_dict}})
else:
db.genotypes.insert({
'identifier': identifier,
type_: motif_dict,
})
if group:
                    try:
                        prev_group = db.genotypes.find_one(
                            {'identifier': identifier})['groups']
                    except (TypeError, KeyError):
                        prev_group = []
if group not in prev_group:
prev_group.append(group)
db.genotypes.update({'identifier': identifier},
{"$set": {'groups': prev_group}})
def _add_mtDNA_JSON_file(self, infile, overwrite=False, group=None):
pass
def _add_file_genemark(self, infile, type_, overwrite=False, group=None):
infile = infile.readlines()
infile = [line.decode('utf-8') if type(line) != str else line\
for line in infile]
prev_identifier = ''
markers = []; allele1s = []; allele2s = []
columns = [infile[0].split('\t').index(head) for head in\
['Sample Name', 'Marker', 'Allele 1', 'Allele 2']]
with get_mongodb(self.dbname) as db:
for line in infile[1:]:
words = line.strip().split('\t')
identifier = words[columns[0]]
marker = utils.encode_marker(words[columns[1]], type_)
allele1 = words[columns[2]]
allele2 = words[columns[3]]
if not allele2:
allele2 = allele1
if not prev_identifier or prev_identifier == identifier:
markers.append(marker)
allele1s.append(allele1)
allele2s.append(allele2)
else:
marker_dict = {}
for i, marker in enumerate(markers):
alleles = allele1s[i], allele2s[i]
marker_dict[marker] = utils.encode_alleles(
alleles, type_, marker)
                    prev_record = \
                        db.genotypes.find_one({'identifier': prev_identifier})
if prev_record:
if overwrite or (type_ not in prev_record.keys()):
db.genotypes.update({'identifier': prev_identifier},
{'$set': {type_: marker_dict}})
else:
                            raise AlreadyExistException(
                                'Uploaded astr example already existed')
else:
db.genotypes.insert({
'identifier': prev_identifier,
type_: marker_dict})
if group:
try:
prev_group = db.genotypes.find_one(
{'identifier': prev_identifier})['groups']
except:
prev_group = []
if group not in prev_group:
prev_group.append(group)
db.genotypes.update({'identifier': prev_identifier},
{"$set": {'groups': prev_group}})
markers = [marker]
allele1s = [allele1]
allele2s = [allele2]
prev_identifier = identifier
marker_dict = {}
for i, marker in enumerate(markers):
alleles = allele1s[i], allele2s[i]
marker_dict[marker] = utils.encode_alleles(
alleles, type_, marker)
prev_record = \
db.genotypes.find_one({'identifier': identifier})
if prev_record:
if overwrite or (type_ not in prev_record.keys()):
db.genotypes.update({'identifier': prev_identifier},
{'$set': {type_: marker_dict}})
else:
db.genotypes.insert({
'identifier': prev_identifier,
type_: marker_dict,
})
if group:
try:
prev_group = db.genotypes.find_one(
{'identifier': prev_identifier})['groups']
except:
prev_group = []
if group not in prev_group:
prev_group.append(group)
db.genotypes.update({'identifier': prev_identifier},
{"$set": {'groups': prev_group}})
def _add_file_csv(self, infile, type_, overwrite=False, group=None):
data = pd.read_csv(infile, index_col=0)
with get_mongodb(self.dbname) as db:
for identifier, str_ in data.iterrows():
try:
prev_record =\
db.genotypes.find_one({'identifier':identifier})
if prev_record:
if overwrite or (type_ not in prev_record.keys()):
db.genotypes.update({'identifier':identifier},
{'$set': {type_:
utils.encode_genotype(str_.to_dict(),\
type_)
}})
else:
db.genotypes.insert({
'identifier': identifier,
type_: utils.encode_genotype(str_.to_dict(), type_)
})
except:
print('ERROR in ', identifier,'\n', str_)
if group:
try:
prev_group = db.genotypes.find_one(
{'identifier': identifier})['groups']
except:
prev_group = []
if group not in prev_group:
prev_group.append(group)
db.genotypes.update({'identifier': identifier},
{"$set": {'groups': prev_group}})
def _add_file_json(self, infile, type_, overwrite=False, group=None):
genotypes_json = json.loads(infile.read())
with get_mongodb(self.dbname) as db:
for genotype_json in genotypes_json:
identifier = genotype_json['identifier']
if type_ in genotype_json:
str_ = genotype_json[type_]
for marker_name, alleles in str_.items():
str_[marker_name] = utils.encode_alleles(
alleles, type_, marker_name)
prev_record = \
db.genotypes.find_one({'identifier':identifier})
if prev_record:
if overwrite or (type_ not in prev_record.keys()):
db.genotypes.update({'identifier': identifier},
{'$set': {type_: str_}})
else:
print('{} already exist, '
'you can use overwrite option'.format(identifier))
continue
else:
db.genotypes.insert({
'identifier': identifier,
type_: str_,
})
if group:
                    try:
                        prev_group = db.genotypes.find_one(
                            {'identifier': identifier})['groups']
                    except (TypeError, KeyError):
                        prev_group = []
                    if group not in prev_group:
                        prev_group.append(group)
                    db.genotypes.update({'identifier': identifier},
                        {"$set": {'groups': prev_group}})
    def genmark_txt_to_df(self, txtfile_or_path, type_):
if type(txtfile_or_path) == str:
txtfile = open(txtfile_or_path, encoding='utf-8')
else:
txtfile = txtfile_or_path
genotypes = []
column_line = next(txtfile)
if type(column_line) != str:
column_line = column_line.decode('utf-8')
column_list = column_line.strip().split('\t')
column_position = {column:position for position, column in\
enumerate(column_list)}
for essential_column in ['Sample Name','Marker','Allele 1','Allele 2']:
if essential_column not in column_list:
                raise ParsingException('There is no "{}" column in '
                    'Genmapper file.'.format(essential_column))
index_sample_name = column_position['Sample Name']
index_marker = column_position['Marker']
index_allele_1 = column_position['Allele 1']
index_allele_2 = column_position['Allele 2']
max_index = max([index_sample_name, index_marker, index_allele_1,\
index_allele_2])
prev_identifier = ''
genotype = {}
line_number = 1
for line in txtfile:
line_number += 1
if type(line) != str:
line = line.decode('utf-8')
line = line.strip()
if not line or line.startswith('Sample Name'):
continue
words = line.split('\t')
if len(words) < max_index:
continue
identifier = words[index_sample_name]
try:
marker = words[index_marker]
except IndexError:
raise ParsingException('There is no "marker" in Genmapper file'
' line {}.'.format(line_number))
marker = utils.encode_marker(marker, type_)
if marker in config.IGNORE_MARKERS[type_]:
continue
try:
allele1 = words[index_allele_1]
except IndexError:
raise ParsingException('There is no "allele1" in Genmapper file'
' line {}.'.format(line_number))
try:
allele2 = words[index_allele_2]
except IndexError:
raise ParsingException('There is no "allele2" in Genmapper file'
' line {}.'.format(line_number))
if allele2 == '':
allele2 = allele1
if prev_identifier and prev_identifier != identifier:
genotype['identifier'] = prev_identifier
genotypes.append(genotype)
genotype = {}
if marker in config.SEX_MARKERS:
genotype.update({
marker: '{}{}'.format(allele1, allele2),
})
else:
genotype.update({
marker: '{}, {}'.format(allele1, allele2),
})
prev_identifier = identifier
if prev_identifier and genotype:
genotype['identifier'] = prev_identifier
genotypes.append(genotype)
columns = []
for genotype in genotypes:
columns += [key for key in genotype]
columns = list(set(columns))
columns.remove('identifier')
columns = ['identifier'] + columns
df = pd.DataFrame(genotypes, columns=columns)
return df
    def mtdna_genmark_txt_to_df(self, txtfile_or_path):
if type(txtfile_or_path) == str:
txtfile = open(txtfile_or_path, encoding='utf-8')
else:
txtfile = txtfile_or_path
headers = []
bodies = []
for line in txtfile:
if type(line) != str:
line = line.decode('utf-8')
sp = line.strip().split('\t')
if sp[0] == 'Specimen':
headers = sp
line = next(txtfile)
if type(line) != str:
line = line.decode('utf-8')
bodies = line.strip().split('\t')
break
if headers == []:
raise ParsingException('There is no Specimen in Genmapper file.')
elif len(headers) != len(bodies):
            raise ParsingException(
                'The numbers of header and body are different.')
columns = ['identifier']
values = [bodies[0]]
for i in range(1,len(headers)):
try:
trans, position, area = headers[i].split('___')
except ValueError:
                raise ParsingException('marker name "{}" is not proper '
                    'in Genmapper file.'.format(headers[i]))
try:
gene_code, remainder = bodies[i].split(' ')
except ValueError:
                raise ParsingException('genotype "{}" is not proper in '
                    'Genmapper file.'.format(bodies[i]))
if gene_code == '-':
gene_code = 'd'
if trans == '-':
additional_num = 1
start_index = headers.index(headers[i])
while headers.index(headers[i], start_index) != i:
additional_num += 1
start_index = headers.index(headers[i], start_index+1)
position = position + '.{}'.format(additional_num)
columns.append(utils.encode_marker(position, 'mtDNA'))
values.append(gene_code)
df = pd.DataFrame(pd.Series(values)).T
df.columns = columns
return df
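# Example usage for GenotypeManager (an illustrative sketch; the file name,
# identifier and group are hypothetical):
#
#     gm = GenotypeManager()
#     with open('run_2024_export.txt', 'rb') as handle:
#         gm.create(handle, informat='GeneMark', type_='A-STR', group='case-42')
#     container = gm.read('SAMPLE-001', type_='A-STR')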