# Source code for kinmatch.db

# -*- coding:UTF-8 -*-
from __future__ import unicode_literals
import os
import sys
import json
import csv
import time
from contextlib import contextmanager
from pymongo import MongoClient
import pandas as pd
import numpy as np
import time

from kinmatch import config
from kinmatch import utils
from kinmatch.utils import DataContainer
from kinmatch import (AlreadyExistException, 
                        FileTypeException,
                        NoValueException,
                        DBConnectionException,
                        ParameterException,
                        ParsingException)

@contextmanager
def get_mongodb(dbname=config.DEFAULT_MONGODB_NAME,
                user=config.DEFAULT_MONGODB_USER,
                password=config.DEFAULT_MONGODB_PASSWORD):
    """Context manager yielding an authenticated MongoDB database handle.

    Connects to ``config.MONGODB_URL``, authenticates *user* against
    *dbname*, yields the database object, and always closes the client
    on exit.

    Raises:
        DBConnectionException: if authentication fails.
    """
    client = MongoClient(config.MONGODB_URL)
    try:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any driver error still maps to the
        # project exception callers expect.
        client.the_database.authenticate(user, password, source=dbname)
    except Exception:
        # BUG FIX: the client used to leak when authentication failed.
        client.close()
        raise DBConnectionException('Cannot connect database')
    try:
        yield client[dbname]
    finally:
        # The original re-raised exceptions explicitly and had an empty
        # ``else`` branch; propagation is automatic, only cleanup remains.
        client.close()
class ResourceManager:
    """Abstract CRUD base class for the MongoDB-backed managers.

    Subclasses override each operation; the base implementations only
    raise ``NotImplementedError``.
    """

    def __init__(self, dbname=config.DEFAULT_MONGODB_NAME):
        # Name of the MongoDB database every operation targets.
        self.dbname = dbname

    def create(self, *args, **kwargs):
        """Create a resource (abstract)."""
        raise NotImplementedError("create is not implemented.")

    def read(self, *args, **kwargs):
        """Fetch a resource (abstract)."""
        raise NotImplementedError("read is not implemented.")

    def update(self, *args, **kwargs):
        """Modify a resource (abstract)."""
        raise NotImplementedError("update is not implemented.")

    def delete(self, *args, **kwargs):
        """Remove a resource (abstract)."""
        raise NotImplementedError("delete is not implemented.")

    def list(self, *args, **kwargs):
        """Enumerate resources (abstract)."""
        raise NotImplementedError("list is not implemented.")
class TaskManager(ResourceManager):
    """CRUD manager for multiple-search task documents in MongoDB."""

    def create(self, identifier, query_group, query_type, query_count,
               query_first, query_second, target_group, target_type,
               target_count, target_first, target_second, relationship_type,
               description, partial, astr_option, ystr_option, mtdna_option,
               dbname):
        """Register a new task document with status ``Submitted``.

        NOTE(review): the ``dbname`` argument is accepted but unused
        (``self.dbname`` is what the connection uses); kept for backward
        compatibility with existing callers.
        """
        self.identifier = identifier
        self.query_group = query_group
        self.query_type = query_type
        self.query_count = query_count
        self.query_first = query_first
        self.query_second = query_second
        self.target_group = target_group
        self.target_type = target_type
        self.target_count = target_count
        self.target_first = target_first
        self.target_second = target_second
        self.relationship_type = relationship_type
        self.description = description
        self.partial = partial
        self.astr_option = astr_option
        self.ystr_option = ystr_option
        self.mtdna_option = mtdna_option
        self.status = 'Submitted'
        with get_mongodb(self.dbname) as db:
            db.multiple_search_task.insert({
                'identifier': self.identifier,
                'query_group': self.query_group,
                'query_type': self.query_type,
                'query_first': self.query_first,
                'query_second': self.query_second,
                'query_count': self.query_count,
                'target_group': self.target_group,
                'target_type': self.target_type,
                'target_count': self.target_count,
                'target_first': self.target_first,
                'target_second': self.target_second,
                'status': self.status,
                # BUG FIX: the format string was '%Y%m%H%M%S' — the
                # day-of-month directive '%d' was missing, producing
                # ambiguous, wrongly-sorted timestamps.
                'registered_time': time.strftime('%Y%m%d%H%M%S'),
                'progress': 0,
                'result': None,
                'relationship_type': self.relationship_type,
                'description': self.description,
                'partial': self.partial,
                'astr_option': self.astr_option,
                'ystr_option': self.ystr_option,
                'mtdna_option': self.mtdna_option,
            })

    def get(self, identifier):
        """Return the raw task document for *identifier*.

        Raises:
            NoValueException: when no such task exists.
        """
        with get_mongodb(self.dbname) as db:
            task = db.multiple_search_task.find_one(
                {'identifier': identifier})
            if task is None:
                raise NoValueException('There is no task {0}'.
                                       format(identifier))
            return task

    def read(self, identifier):
        """Return the task wrapped in a DataContainer.

        The stored JSON ``result`` (if any) is decoded into a transposed
        DataFrame for presentation.
        """
        task = self.get(identifier)
        del task['_id']
        if task['result']:
            task['result'] = pd.DataFrame(json.loads(task['result'])).T
        return DataContainer(
            status='Success',
            message='Fetching task identifier {}'.format(identifier),
            items=task)

    def update(self, identifier, field, value):
        """Set a single *field* of the task document to *value*."""
        with get_mongodb(self.dbname) as db:
            db.multiple_search_task.update({'identifier': identifier},
                                           {"$set": {field: value}})

    def delete(self, identifier):
        """Remove the task document(s) matching *identifier*."""
        with get_mongodb(self.dbname) as db:
            db.multiple_search_task.remove({'identifier': identifier})
        message = 'Deleted tasks {}'.format(identifier)
        return DataContainer(status='Success', message=message)

    def list(self, identifier=None, query_group=None, query_type=None,
             target_group=None, target_type=None, description=None,
             status=None, sort='registered_time', desc=True, paginate_by=20,
             page=None):
        """List tasks filtered by regex on the given fields, paginated.

        Raises:
            ParameterException: for an unsupported ``query_type`` or
                ``sort`` value.
        """
        ## check arguments
        if query_type is not None and \
                query_type not in ['all', 'range', 'keyword', 'manual']:
            raise ParameterException('"{}" cannot be used for "query_type"'
                                     ' value'.format(query_type))
        if sort not in ['identifier', 'query_group', 'query_type',
                        'query_count', 'target_group', 'target_type',
                        'target_count', 'status', 'progress',
                        'registered_time']:
            raise ParameterException('{} cannot be used for "sort" value'.
                                     format(sort))
        ## build a regex filter for every non-None argument
        mongodb_query = {}
        for k, v in {'identifier': identifier, 'query_group': query_group,
                     'query_type': query_type, 'target_group': target_group,
                     'target_type': target_type, 'description': description,
                     'status': status}.items():
            if v is not None:
                mongodb_query[k] = {'$regex': v}
        with get_mongodb(self.dbname) as db:
            tasks = db.multiple_search_task.find(mongodb_query)
            desc = -1 if desc else 1
            tasks.sort(sort, desc)
            results = list()
            for task in tasks:
                # strip fields that are large or not JSON-serializable
                if '_id' in task:
                    del task['_id']
                if 'result' in task:
                    del task['result']
                results.append(task)
        total_count = len(results)
        if page is not None:
            page = int(page)
            results = results[(page - 1) * paginate_by:page * paginate_by]
        # NOTE(review): round() here floors/rounds the page count rather
        # than taking the ceiling — last partial page may be dropped from
        # the count; preserved for backward compatibility.
        return DataContainer(status='Success',
                             message='Fetching tasks',
                             totalCount=total_count,
                             totalPageCount=round(total_count / paginate_by),
                             items=results)

    def get_alltasks(self, field=None, query=None, sort='registered_time',
                     desc=True):
        """Return a sorted cursor over all tasks, optionally regex-filtered
        on *field*."""
        desc = -1 if desc else 1
        with get_mongodb(self.dbname) as db:
            if field:
                tasks = db.multiple_search_task.find(
                    {field: {'$regex': query}})
            else:
                tasks = db.multiple_search_task.find()
            tasks.sort(sort, desc)
            return tasks
class GroupManager(ResourceManager):
    """Manager for sample groups; most CRUD hooks are placeholders."""

    def create(self, identifier):
        """Not implemented yet."""
        pass

    def read(self, identifier):
        """Not implemented yet."""
        pass

    def update(self, identifier):
        """Not implemented yet."""
        pass

    def delete(self, identifier):
        """Not implemented yet."""
        pass

    def list(self):
        """Return group list (not implemented yet)."""
        pass

    def ids_belong_group(self, group):
        """Return the identifiers of all genotypes tagged with *group*."""
        with get_mongodb(self.dbname) as db:
            cursor = db.genotypes.find({'groups': group}, {'identifier': 1})
            return [record['identifier'] for record in cursor]
class AlleleFrequenciesManager(ResourceManager):
    """Manager for allele-frequency tables stored per name/marker-type."""

    # Marker panels used by make_current_db() when rebuilding the
    # 'current' frequency tables from the genotype collection.
    current_allele_frequency = {
        'astr_15': {'type_': 'A-STR',
                    'markers': ['D8S1179', 'D21S11', 'D7S820', 'CSF1PO',
                                'D3S1358', 'TH01', 'D13S317', 'D16S539',
                                'D2S1338', 'D19S433', 'vWA', 'TPOX',
                                'D18S51', 'D5S818', 'FGA']},
        'astr_23': {'type_': 'A-STR',
                    'markers': ['D8S1179', 'D21S11', 'D7S820', 'CSF1PO',
                                'D3S1358', 'TH01', 'D13S317', 'D16S539',
                                'D2S1338', 'D19S433', 'vWA', 'TPOX',
                                'D18S51', 'D5S818', 'FGA', 'Penta_E',
                                'Penta_D', 'D10S1248', 'D12S391', 'D1S1656',
                                'D22S1045', 'D2S441', 'SE33']},
        'ystr_16': {'type_': 'Y-STR',
                    'markers': ['DYS385', 'DYS389II', 'DYS392']},
        'ystr_23': {'type_': 'Y-STR',
                    'markers': ['DYS385', 'DYS389II', 'DYS392', 'DYS635',
                                'GATAH4__1']},
    }

    def create(self, infile, informat, name, type_, description=None):
        """Upload a new allele-frequency table (no overwrite)."""
        result = self.add(infile, informat, name, type_, overwrite=False,
                          initial=False, description=description)
        if result is True:
            stat = 'Success'
            message = 'Success uploading {}'.format(name)
        else:
            stat = 'Failure'
            message = 'Failure uploading {}'.format(name)
        return DataContainer(status=stat, message=message)

    def read(self, name, type_):
        """Fetch the stored tables for *name* as a DataContainer."""
        result = self.get_from_read(name, type_)  ## return dictionary
        return DataContainer(
            status='Success',
            message='Success fetching Allele frequency {}'.format(name),
            items=result,
            trans=True)

    def update(self, infile, informat, name, type_, description=None):
        """Upload an allele-frequency table, overwriting any existing one."""
        result = self.add(infile, informat, name, type_, overwrite=True,
                          initial=False, description=description)
        if result is True:
            stat = 'Success'
            message = 'Success uploading {}'.format(name)
        else:
            stat = 'Failure'
            message = 'Failure uploading {}'.format(name)
        return DataContainer(status=stat, message=message)

    def delete(self, name):
        """Delete every table stored under *name*."""
        result = self.delete_allele_frequencies(name)
        if result is True:
            stat = 'Success'
            message = 'Success deleting {}'.format(name)
        else:
            stat = 'Failure'
            message = 'Failure deleting {}'.format(name)
        return DataContainer(status=stat, message=message)

    def list(self):
        """Summarize all stored tables as a DataFrame in a DataContainer."""
        result = self.export_allele_frequency()  ## return dataframe
        return DataContainer(status='Success', message='', items=result)

    def add(self, infile, informat, name, type_, overwrite=False,
            initial=False, description=None):
        """Parse *infile* (CSV/XLS/XLSX) and store it under *name*/*type_*.

        Allele indices are multiplied by 10 and stored as strings; marker
        columns are encoded via ``utils.encode_marker``.

        Raises:
            AlreadyExistException: table exists and ``overwrite`` is False.
            FileTypeException: unsupported ``informat``.
        """
        with get_mongodb(self.dbname) as db:
            if initial:
                db.allele_frequencies.remove()
            prev_record = db.allele_frequencies.find_one({'name': name})
        already_exist = bool(prev_record and type_ in prev_record.keys())
        if already_exist and not overwrite:
            raise AlreadyExistException('Allele frequencies {} already'
                                        ' existed'.format(name))
        if informat.lower() == 'csv':
            data = pd.read_csv(infile, index_col=0)
        elif informat.lower() in ('xls', 'xlsx'):
            excel_file = pd.ExcelFile(infile)
            data = excel_file.parse(excel_file.sheet_names[0], index_col=0)
        else:
            raise FileTypeException('Input file must be CSV or XLS, XLSX')
        # Alleles like 12.3 become integer keys (123) stored as strings,
        # avoiding float keys in MongoDB documents.
        data.index = data.index * 10
        data.index = data.index.astype(object).astype(int).astype(str)
        data.columns = [utils.encode_marker(column, type_)
                        for column in data.columns]
        with get_mongodb(self.dbname) as db:
            if prev_record and \
                    ((not already_exist) or (already_exist and overwrite)):
                db.allele_frequencies.update({'name': name}, {'$set': {
                    type_: data.to_dict(),
                    'description': description,
                }})
            else:
                db.allele_frequencies.insert({
                    'name': name,
                    type_: data.to_dict(),
                    'description': description,
                })
            return True
        return False

    def make_current_db(self):
        """Rebuild the panel frequency tables from the genotype collection.

        NOTE(review): ``name`` is initialized to 'current' but immediately
        reused as the loop variable over ``current_allele_frequency``, so
        the final description update targets the last panel iterated, not
        a 'current' record — preserved as-is, confirm intent.
        """
        name = 'current'
        gm = GenotypeManager(dbname=self.dbname)
        for name in self.current_allele_frequency:
            type_ = self.current_allele_frequency[name]['type_']
            markers = self.current_allele_frequency[name]['markers']
            df = pd.DataFrame()
            try:
                genotypes = gm.get_genotypes(type_)
                genotypes = genotypes[markers]
                # keep only samples typed at every panel marker
                for marker in markers:
                    genotypes = genotypes[genotypes[marker].notnull()]
            except:
                # panel marker missing entirely — skip this panel
                continue
            for marker in markers:
                total_count = 0
                alleles_count = {}
                for allele in genotypes[marker]:
                    if type_ in ['A-STR', 'Y-STR']:
                        try:
                            allele_1, allele_2 = allele[0], allele[1]
                        except:
                            continue
                    elif type_ == 'mtDNA':
                        if not pd.isnull(allele):
                            allele_1, allele_2 = allele, allele
                        else:
                            continue
                    else:
                        try:
                            allele_1, allele_2 = allele[0], allele[0]
                        except:
                            continue
                    total_count += 2
                    try:
                        alleles_count[str(allele_1)][0] += 1
                    except:
                        alleles_count[str(allele_1)] = [1]
                    try:
                        alleles_count[str(allele_2)][0] += 1
                    except:
                        alleles_count[str(allele_2)] = [1]
                marker = utils.encode_marker(marker, type_)
                alleles_frequency = pd.DataFrame(
                    alleles_count, index=[marker]) / float(total_count)
                df = df.append(alleles_frequency)
            # NOTE(review): truth-testing a DataFrame raises ValueError on
            # modern pandas; relies on the legacy pandas this module was
            # written against — confirm before upgrading.
            if df:
                df = df.T
                with get_mongodb(self.dbname) as db:
                    if db.allele_frequencies.find({'name': name}).count() > 0:
                        db.allele_frequencies.update(
                            {'name': name},
                            {"$set": {type_: df.to_dict()}})
                    else:
                        db.allele_frequencies.insert({
                            'name': name,
                            type_: df.to_dict()
                        })
        with get_mongodb(self.dbname) as db:
            if db.allele_frequencies.find({'name': name}).count() > 0:
                db.allele_frequencies.update(
                    {'name': name},
                    {"$set": {'description':
                        'allele frequencies status in current database' +
                        ' ({})'.format(time.strftime(
                            '%Y/%m/%d %H:%M', time.localtime()))}})

    def get_names_info(self):
        """Return [name, type, #markers, #alleles] for every stored table.

        NOTE(review): ``type_`` stays unbound if a record contains none of
        A-STR/Y-STR/mtDNA — assumed impossible for valid data; confirm.
        """
        name_and_info = []
        with get_mongodb(self.dbname) as db:
            for record in db.allele_frequencies.find():
                if 'A-STR' in record:
                    type_ = 'A-STR'
                elif 'Y-STR' in record:
                    type_ = 'Y-STR'
                elif 'mtDNA' in record:
                    type_ = 'mtDNA'
                af = pd.DataFrame.from_dict(record[type_])
                name_and_info.append([record['name'], type_,
                                      str(len(af.columns)),
                                      str(len(af.index))])
        return name_and_info

    def get(self, name, type_):
        """Return the stored table for *name*/*type_* as a DataFrame
        (or the description string when ``type_ == 'description'``).

        Raises:
            NoValueException: when no matching record exists.
        """
        with get_mongodb(self.dbname) as db:
            af = db.allele_frequencies.find_one(
                {'name': name, type_: {"$exists": 1}})
            if af is None:
                raise NoValueException(
                    'There is no Allele Frequency {0} {1}'.format(
                        name, type_))
            if type_ == 'description':
                return af['description']
            # create dataframe
            df = pd.DataFrame.from_dict(af[type_])
            df.index = df.index.astype(int)
            # NOTE(review): legacy pandas sort-by-index; its return value
            # is discarded, so this may be a no-op — confirm on upgrade.
            df.sort()
            return df

    def get_from_read(self, name, type_):
        """Return every stored table for *name*, allele indices rescaled
        back from the x10 storage encoding."""
        result = {}
        with get_mongodb(self.dbname) as db:
            try:
                af_types = list(
                    db.allele_frequencies.find_one({'name': name}).keys())
            except AttributeError:
                # find_one returned None — no such record
                raise NoValueException(
                    'AlleleFrequency {} does not exist'.format(name))
        if (type_ is not None) and (type_ not in af_types):
            return None
        af_types.remove('_id')
        af_types.remove('name')
        for type2 in af_types:
            df = self.get(name, type2)
            if type2 != 'description':
                # undo the x10 integer encoding: 123 -> '12.3', 120 -> '12'
                df.index = [str(id_ / 10.0).replace('.0', '')
                            for id_ in df.index]
            result[type2] = df
        return result

    def export_allele_frequency(self):
        """Summarize every stored table; None when nothing is stored."""
        column_names = ['name', 'type', 'number of markers',
                        'number of alleles', 'description']
        with get_mongodb(self.dbname) as db:
            result = list()
            for af in db.allele_frequencies.find():
                desc = af['description'] if 'description' in af else None
                for type_ in af:
                    if type_ not in ['name', '_id', 'description']:
                        af_df = pd.DataFrame.from_dict(af[type_])
                        result.append([af['name'], type_,
                                       len(af_df.columns),
                                       len(af_df.index), desc])
        return None if not result else pd.DataFrame(result,
                                                    columns=column_names)

    def delete_allele_frequencies(self, name):
        """Remove all records stored under *name*; always returns True."""
        with get_mongodb(self.dbname) as db:
            db.allele_frequencies.remove({'name': name})
        return True

    def show(self, name, type_, outfile, seperator='tab', excelfile=''):
        """Write the table for *name*/*type_* to *outfile* (CSV) or to
        *excelfile* (XLSX) when the latter is given.

        Raises:
            ParameterException: for an unsupported ``seperator`` value.
        """
        if seperator == 'tab':
            sep = '\t'
        elif seperator == 'comma':
            sep = ','
        else:
            # BUG FIX: any other value previously left ``sep`` unbound and
            # crashed with UnboundLocalError on the CSV path; fail with the
            # project's standard parameter error instead.
            raise ParameterException(
                '"{}" cannot be used for "seperator" value'.format(seperator))
        df = self.get(name, type_)
        df.index = df.index / 10.
        if excelfile:
            df.to_excel(excelfile, sheet_name='Sheet1')
        else:
            df.to_csv(outfile, sep=sep)
[문서]class GenotypeManager(ResourceManager):
[문서] def create(self, infile, informat='GeneMark', type_='A-STR', group=None): if informat not in ['GeneMark', 'CSV', 'TSV', 'XLSX']: raise ParameterException('informat argument has wrong value') result = self.add_file(infile, type_, format=informat, overwrite=False, initial=False, group=group) ## return True if result is True: stat = 'Success' message = 'Success uploading {}'.format(type_) else: stat = 'Failure' message = 'Failure uploading {}'.format(type_) return DataContainer(status=stat, message=message)
[문서] def read(self, identifier, type_=None, kit='All', null='-'): result = self.get_genotype(identifier) del(result['_id']) del(result['identifier']) if not result: raise NoValueException('identifier {} does not exist'.format(\ identifier)) if type_ is not None: if type_ not in result: return DataContainer(status='Success',\ message='There is no {} related identifier'.format(\ type_)) # return None result = {type_: result[type_]} types = list(result.keys()) for type_ in types: if type_ == 'A-STR': gc = utils.ASTRGenotypeCleaner(result[type_], is_encoded=True) astrs = gc.decode() result[type_] = [({marker:astrs[marker]} if (marker in astrs)\ else {marker:null}) for marker in\ config.MARKERS_IN_KITS['A-STR'][kit]] elif type_ == 'Y-STR': gc = utils.YSTRGenotypeCleaner(result[type_], is_encoded=True) ystrs = gc.decode() result[type_] = [({marker:ystrs[marker]} if (marker in ystrs)\ else {marker:null}) for marker in\ config.MARKERS_IN_KITS['Y-STR'][kit]] elif type_ == 'mtDNA': gc = utils.MtdnaGenotypeCleaner(result[type_], is_encoded=True) mtdnas = gc.decode() result[type_] = mtdnas else: pass return DataContainer(status='Success',\ message='Success fetching Genotype {}'.format(identifier),\ items=result, trans=True)
[문서] def update(self, file_or_individual, *args, **kwargs): if file_or_individual == 'file': return self.update_by_file(*args, **kwargs) elif file_or_individual == 'individual': return self.update_by_genotype(*args, **kwargs)
[문서] def update_by_file(self, infile, informat='GeneMark', type_='A-STR',\ save_by='replacement', group=None): infile = iter(infile.readlines()) ## check save_bys if save_by not in ['replacement', 'merge-with-overwrite',\ 'merge-without-overwrite']: raise ParameterException('save_by argument has wrong value') if informat.lower() not in ['genemark', 'csv', 'tsv', 'xlsx']: raise ParameterException('informat argument has wrong value') ## parse file to make dataframe if informat.lower() == 'csv': df = pd.read_csv(infile, encoding='cp949', index_col=['identifier']) df = df.astype(object).fillna('').astype(str) df.columns = [str(c).replace('.0', '') for c in df.columns] if informat.lower() == 'tsv': df = pd.read_csv(infile, encoding='cp949',\ index_col=['identifier'], sep='\t') df = df.astype(object).fillna('').astype(str) df.columns = [str(c).replace('.0', '') for c in df.columns] elif informat.lower() == 'xlsx': excel_file = pd.ExcelFile(infile) df = excel_file.parse(excel_file.sheet_names[0])#, index_col=0) df = df.set_index('identifier') df = df.astype(object).fillna('').astype(str) df.columns = [str(c).replace('.0', '') for c in df.columns] elif informat.lower() == 'genemark': if type_ == 'mtDNA': df = self.mtdna_genmark_txt_to_df(infile) elif type_ in ['A-STR', 'Y-STR']: df = self.genmark_txt_to_df(infile, type_) df = df.set_index('identifier') if type_ == 'mtDNA': for m in df.columns: m_ori = utils.decode_marker(m, 'mtDNA') m = float(m_ori) if not ((16024 <= m <= 16365) | (73 <= m <= 340) |\ (438 <= m <= 574)): raise ParameterException('marker "{}" cannot used'.\ format(m_ori)) ## update database for identifier, series in df.iterrows(): series = series[series.notnull()] genotype = utils.encode_genotype(series.to_dict(), type_, None) if save_by == 'replacement': self.set_genotype(identifier, type_, genotype) elif save_by == 'merge-with-overwrite': self.set_genotype_merge_with_overwrite(identifier, type_,\ genotype) elif save_by == 'merge-without-overwrite': 
self.set_genotype_merge_without_overwrite(identifier, type_,\ genotype) stat = 'Success' message = 'Success uploading {}'.format(type_) return DataContainer(status=stat, message=message)
[문서] def update_by_genotype(self, type_, identifier, genotype, save_by='merge-with-overwrite'): if save_by not in ['merge-with-overwrite']: raise ParameterException('save_by argument has wrong value') if type_.lower() == 'mtDNA': for m,v in genotype.items(): if not ((16024 <= m <= 16365) | (73 <= m <= 340) |\ (438 <= m <= 574)): raise ParameterException('marker "{}" cannot used'.\ format(m)) encoded_genotype = {utils.encode_marker(m, type_):\ utils.encode_alleles(v, type_, utils.encode_marker(m, type_))\ for m,v in genotype.items()} self.set_genotype_merge_with_overwrite(identifier, type_,\ encoded_genotype) stat = 'Success' message = 'Success uploading {}'.format(type_) return DataContainer(status=stat, message=message)
[문서] def delete(self, identifier, genotype='all'): # result = self.delete_genotype(identifier) ## return None result = self.remove_genotype(identifier, genotype) if result is True: stat = 'Success' message = 'Success deleting {}'.format(identifier) else: stat = 'Failure' message = 'Failure deleting {}'.format(identifier) return DataContainer(status=stat, message=message)
[문서] def list(self, type_=None, identifier=None, page=None, paginate_by=10,\ sort=None, desc=False, filter_astr=False, filter_ystr=False,\ filter_mtdna=False): if type_ is None: temp_results = [] with get_mongodb(self.dbname) as db: if identifier: find_result = db.genotypes.find({'identifier':\ {'$regex': identifier}}) else: find_result = db.genotypes.find() for sample in find_result: id_ = sample['identifier'] del(sample['_id']) del(sample['identifier']) temp_result = {} for key in sample: if key == 'groups': temp_result[key] = ','.join(sample[key]) else: temp_result[key] = len([marker for marker in\ sample[key] if sample[key][marker] is\ not None]) temp_results.append([id_, temp_result]) if not temp_results: return DataContainer(status='Success',\ message='There is no Genotype "{}"'.\ format(type_), items=[]) result = pd.DataFrame.from_items(temp_results).T result = result.replace(np.nan, 0) if filter_astr: result = result.loc[(result['A-STR']!=0)] if filter_ystr: result = result.loc[(result['Y-STR']!=0)] if filter_mtdna: result = result.loc[(result['mtDNA']!=0)] else: result = self.get_genotypes(type_) total_count = result.shape[0] if sort is not None: if sort == 'identifier': index = list(result.index) index.sort(reverse=not desc) result = result.loc[index] else: result = result.sort(sort, ascending=not desc) if page is not None: result = result.iloc[(page-1)*paginate_by : page*paginate_by] if type_ is not None: result = {type_:result} return DataContainer(status='Success', message='Success fetching Genotype {}'.\ format(type_), items=result, totalCount=total_count, totalPageCount=round(total_count/paginate_by), trans=False, zeroization=True)
[문서] def delete_genotype(self, identifier): with get_mongodb(self.dbname) as db: db.genotypes.remove({'identifier': identifier}) return True
[문서] def get_genotype(self, identifier): with get_mongodb(self.dbname) as db: genotype = db.genotypes.find_one({'identifier': identifier}) new_genotype = {} if genotype: for type_, type_genotype in genotype.items(): if type_genotype in ('A-STR', 'Y-STR', 'mtDNA'): g = {} for marker, alleles in type_genotype.items(): #marker = utils.decode_marker(marker, type_) g[markers] = alleles new_genotype[type_] = g else: new_genotype[type_] = type_genotype else: raise AssertionError('There is no such identifier {}'.format( identifier)) return new_genotype
[문서] def get_genotype_with_none(self, identifier): genotype = self.get_genotype(identifier) if genotype: for type_ in ('A-STR', 'Y-STR'): if type_ in genotype: for marker, alleles in genotype[type_].items(): if type(alleles) != list: genotype[type_][marker] = None if 'mtDNA' in genotype: for position, allele in genotype['mtDNA'].items(): if pd.isnull(allele): genotype['mtDNA'][position] = None for type_ in ('A-STR', 'Y-STR', 'mtDNA'): if type_ in genotype: if not any(genotype[type_].values()): genotype.pop(type_) return genotype
[문서] def remove_genotype(self, identifier, genotype='all'): if genotype != 'all': sample_genotype = self.get_genotype(identifier) for key in ['_id', genotype]: try: sample_genotype.pop(key) except KeyError: pass with get_mongodb(self.dbname) as db: record = db.genotypes.find_one({'identifier':identifier}) if not record: raise NoValueException('identifier {} does not exist'.format(\ identifier)) db.genotypes.remove({'identifier' : identifier}) if genotype != 'all': db.genotypes.insert({k:v for k,v in sample_genotype.items()}) return True
[문서] def set_identifier(self, identifier): with get_mongodb(self.dbname) as db: db.genotypes.insert({'identifier':identifier})
[문서] def set_genotype(self, identifier, type_, genotype): with get_mongodb(self.dbname) as db: db.genotypes.update({'identifier': identifier}, {'$set': { type_: genotype, }}, True)
[문서] def set_genotype_merge_with_overwrite(self, identifier, type_, genotype): with get_mongodb(self.dbname) as db: merged_genotype = {} # prev = db.genotypes.find_one({'identifier': identifier})[type_] try: prev = db.genotypes.find_one({'identifier': identifier})[type_] except: raise NoValueException( 'There is no identifier "{0}" or "{1}"'.format(identifier, type_)) if prev: all_markers = set(list(prev.keys()) + list(genotype.keys())) else: all_markers = list(genotype.keys()) for marker in all_markers: merged_genotype[marker] =\ genotype[marker] if (marker in genotype and\ genotype[marker]) else prev[marker] db.genotypes.update({'identifier': identifier}, {'$set': { type_: merged_genotype, }}, True)
[문서] def set_genotype_merge_without_overwrite(self, identifier, type_, genotype): with get_mongodb(self.dbname) as db: merged_genotype = {} # prev = db.genotypes.find_one({'identifier': identifier})[type_] try: prev = db.genotypes.find_one({'identifier': identifier})[type_] except: raise NoValueException( 'There is no identifier "{0}" or "{1}"'.format(identifier, type_)) if prev: all_markers = set(list(prev.keys()) + list(genotype.keys())) else: all_markers = list(genotype.keys()) for marker in all_markers: merged_genotype[marker] =\ prev[marker] if (marker in prev and prev[marker])\ else genotype[marker] db.genotypes.update({'identifier': identifier}, {'$set': { type_: merged_genotype, }}, True)
[문서] def add_group(self, identifiers, group): with get_mongodb(self.dbname) as db: for identifier in identifiers: record = db.genotypes.find_one({'identifier': identifier}) if record: if 'groups' not in record: record['groups'] = [] if group not in record['groups']: record['groups'].append(group) else: record = {'groups':[group]} db.genotypes.update({'identifier': identifier}, {'$set': { 'groups': record['groups'], }})
[문서] def get_genotypes(self, type_): with get_mongodb(self.dbname) as db: cursor = db.genotypes.find({type_:{"$exists":1}}, {'identifier': 1, type_: 1}) genotypes = pd.DataFrame.from_items( (g['identifier'], g[type_]) for g in cursor).T if type_ == 'mtDNA': new_columns = [] for c in genotypes.columns: #new_columns.append(utils.decode_marker(c, type_)) new_columns.append(c) genotypes.columns = new_columns return genotypes
[문서] def get_genotypes_by_group(self, type_, group): with get_mongodb(self.dbname) as db: cursor = db.genotypes.find( {type_:{"$exists":1}, 'groups':{"$in":[group]}}, {'identifier': 1, type_: 1, 'groups': 1}) genotypes = pd.DataFrame.from_items( (g['identifier'], g[type_]) for g in cursor).T return genotypes
[문서] def set_initial(self): # answer = input( # "'-t' option deletes all previous data. Is it right? (y/n) ") # if answer != 'y': # return with get_mongodb(self.dbname) as db: db.genotypes.remove()
[문서] def add_file(self, infile, type_, format='GeneMark', overwrite=False,\ initial=False, group=None): if initial: self.set_initial() if type_ == 'mtDNA': if format == 'GeneMark': self._add_mtDNA_genemark_file(infile, overwrite=overwrite, group=group) elif format == 'JSON': self._add_mtDNA_JSON_file(infile, overwrite=overwrite, group=group) elif format == 'CSV': self._add_mtDNA_csv_file(infile, overwrite=overwrite, group=group) else: raise FileTypeException('Input file must be CSV, JSON or\ GeneMark') else: if format == 'GeneMark': self._add_file_genemark(infile, type_,\ overwrite=overwrite, group=group) elif format == 'JSON': self._add_file_json(infile, type_, overwrite=overwrite, group=group) elif format == 'CSV': self._add_file_csv(infile, type_, overwrite=overwrite, group=group) else: raise FileTypeException('Input file must be CSV, JSON or\ GeneMark') return True
    def _add_mtDNA_genemark_file(self, infile, overwrite=False, group=None):
        """Parse a GeneMapper mtDNA export and upsert one genotype record.

        The first tab-separated row starting with 'Specimen' is the
        header; the very next row is the body.  Marker headers look like
        ``trans___position___area``; body cells look like ``code rest``.
        """
        type_ = 'mtDNA'
        headers = []
        bodies = []
        infile = iter(infile.readlines())
        for line in infile:
            if type(line) != str:
                # Bytes input (e.g. a file opened in binary mode).
                line = line.decode('utf-8')
            sp = line.strip().split('\t')
            if sp[0] == 'Specimen':
                headers = sp
                # The data row immediately follows the header row.
                line = next(infile)
                if type(line) != str:
                    line = line.decode('utf-8')
                bodies = line.strip().split('\t')
                break
        if headers == []:
            raise AssertionError('There is no Specimen in Genmapper file.')
        elif len(headers) != len(bodies):
            raise AssertionError('The number of header and body is different.')
        # First body cell is the project / sample identifier.
        project = bodies[0]
        mtDNA = {}
        for i in range(1, len(headers)):
            try:
                trans, position, area = headers[i].split('___')
            except ValueError:
                raise Exception('marker name "{}" is not proper in Genmapper '
                                'file.'.format(headers[i]))
            try:
                gene_code, remainder = bodies[i].split(' ')
            except ValueError:
                raise Exception('genotype "{}" is not proper in Genmapper '
                                'file.'.format(bodies[i]))
            if gene_code == '-':
                # '-' appears to denote a deletion — TODO confirm.
                gene_code = 'd'
            if trans == '-':
                # Repeated header names get a '.N' suffix: count how many
                # earlier columns carry the identical header text.
                additional_num = 1
                start_index = headers.index(headers[i])
                while headers.index(headers[i], start_index) != i:
                    additional_num += 1
                    start_index = headers.index(headers[i], start_index+1)
                position = position + '.{}'.format(additional_num)
            mtDNA[position] = gene_code
        with get_mongodb(self.dbname) as db:
            prev_record = \
                db.genotypes.find_one({'identifier':project})
            if prev_record:
                if overwrite or (type_ not in prev_record.keys()):
                    db.genotypes.update({'identifier':project},
                        {'$set': {type_:mtDNA}})
                else:
                    raise AlreadyExistException('Uploaded mtDNA example\
 already existed')
            else:
                db.genotypes.insert({
                    'identifier': project,
                    type_: mtDNA})
            if group:
                try:
                    prev_group = db.genotypes.find_one(
                        {'identifier': project})['groups']
                except:
                    # Record has no 'groups' field yet (KeyError) —
                    # start a fresh list.
                    prev_group = []
                if group not in prev_group:
                    prev_group.append(group)
                db.genotypes.update({'identifier': project},
                    {"$set": {'groups': prev_group}})

    def _add_mtDNA_csv_file(self, infile, overwrite=False, group=None):
        """Parse a hand-curated mtDNA CSV (identifier + 3 HV columns).

        Columns 1-3 hold whitespace-separated motifs such as '16189C'.
        A series of data-entry quirks are normalised before each motif is
        split into position and SNP letter.
        """
        csv_reader = csv.reader(infile)
        # Header row is consumed and ignored.
        columns = next(csv_reader)
        type_ = 'mtDNA'
        with get_mongodb(self.dbname) as db:
            for record in csv_reader:
                identifier = record[0]
                motif_dict = {}
                for hv in record[1:4]:
                    for motif_ in hv.split():
                        if motif_.startswith('불'):
                            # Korean annotation marker — skip this cell.
                            continue
                        # Known data-entry typos, fixed case by case.
                        if motif_ == '16183C16189C':
                            motif_ = '16183C,16189C'
                        if motif_ == '309.1':
                            motif_ = '309.1C'
                        if motif_ == '235G315.1C':
                            motif_ = '235G,315.1C'
                        if motif_ == '315C.1C':
                            motif_ = '315.1C'
                        if motif_ == '16209Y16218T':
                            motif_ = '16209Y,16218T'
                        if '.p' in motif_:
                            # '.p' placeholder: assign the next free
                            # insertion number after the highest sibling
                            # series (e.g. '309.p' after '309.2' -> '309.3').
                            int_part = motif_.split('.p')[0]
                            temp_series = [m for m in hv.split() if\
                                int_part in m]
                            temp_series.remove(motif_)
                            temp_series.sort()
                            max_series_num = int(temp_series[-1][-2])
                            motif_ = '{}.{}{}'.format(int_part,
                                max_series_num+1, motif_[-1])
                        for motif in [m for m in motif_.split(',') if m]:
                            motif = motif.replace('?', '')
                            if motif.endswith('.'):
                                motif = motif.replace('.', '')
                            if motif[-1] in '1234567890':
                                # Trailing digit with no SNP letter —
                                # drop it.  NOTE(review): this also
                                # shortens the position; confirm intent.
                                motif = motif[:-1]
                            """
                            if '.' in motif:
                                motif_splited = motif.split('.')
                                position = motif_splited[0]
                                snp = motif_splited[1]
                                if position[-1] not in '1234567890':
                                    position = position[:-1]
                            else:
                                position = motif[:-1]
                                snp = motif[-1]
                            """
                            # Last character is the SNP letter; the rest
                            # is the nucleotide position.
                            position = motif[:-1]
                            try:
                                int_position = float(position)
                            except:
                                print("Not int: {}, {}".format(identifier,
                                    position))
                            #position = position.replace('.','__')
                            position = utils.encode_marker(position, type_)
                            snp = motif[-1]
                            motif_dict[position] = snp
                prev_record = db.genotypes.find_one({'identifier': identifier})
                if prev_record:
                    if overwrite or (type_ not in prev_record):
                        db.genotypes.update({'identifier': identifier},
                            {'$set': {type_: motif_dict}})
                else:
                    db.genotypes.insert({
                        'identifier': identifier,
                        type_: motif_dict,
                    })
                if group:
                    try:
                        # NOTE(review): 'prev_identifier' is never defined
                        # in this method — this always raises NameError,
                        # the bare except swallows it, and prev_group
                        # starts empty (clobbering any existing groups).
                        # Almost certainly meant 'identifier'.
                        prev_group = db.genotypes.find_one(
                            {'identifier': prev_identifier})['groups']
                    except:
                        prev_group = []
                    if group not in prev_group:
                        prev_group.append(group)
                    db.genotypes.update({'identifier': identifier},
                        {"$set": {'groups': prev_group}})

    def _add_mtDNA_JSON_file(self, infile, overwrite=False, group=None):
        """Not implemented: mtDNA JSON loading is a silent no-op."""
        pass

    def _add_file_genemark(self, infile, type_, overwrite=False, group=None):
        """Parse an STR GeneMapper export: rows grouped by 'Sample Name'.

        Each run of rows for one sample is flushed to MongoDB when the
        sample name changes, and once more after the loop for the final
        sample.
        """
        infile = infile.readlines()
        infile = [line.decode('utf-8') if type(line) != str else line\
            for line in infile]
        prev_identifier = ''
        markers = []; allele1s = []; allele2s = []
        # Column indices of the four required header fields.
        columns = [infile[0].split('\t').index(head) for head in\
            ['Sample Name', 'Marker', 'Allele 1', 'Allele 2']]
        with get_mongodb(self.dbname) as db:
            for line in infile[1:]:
                words = line.strip().split('\t')
                identifier = words[columns[0]]
                marker = utils.encode_marker(words[columns[1]], type_)
                allele1 = words[columns[2]]
                allele2 = words[columns[3]]
                if not allele2:
                    # Homozygous: a missing second allele repeats the first.
                    allele2 = allele1
                if not prev_identifier or prev_identifier == identifier:
                    markers.append(marker)
                    allele1s.append(allele1)
                    allele2s.append(allele2)
                else:
                    # Sample changed: flush the previous sample's markers.
                    marker_dict = {}
                    for i, marker in enumerate(markers):
                        alleles = allele1s[i], allele2s[i]
                        marker_dict[marker] = utils.encode_alleles(
                            alleles, type_, marker)
                    # NOTE(review): looks up 'identifier' (the NEW sample)
                    # but writes 'prev_identifier' — the overwrite check
                    # inspects the wrong document.  Confirm and fix.
                    prev_record = \
                        db.genotypes.find_one({'identifier': identifier})
                    if prev_record:
                        if overwrite or (type_ not in prev_record.keys()):
                            db.genotypes.update({'identifier': prev_identifier},
                                {'$set': {type_: marker_dict}})
                        else:
                            raise AlreadyExistException('Uploaded astr\
 example already existed')
                    else:
                        db.genotypes.insert({
                            'identifier': prev_identifier,
                            type_: marker_dict})
                    if group:
                        try:
                            prev_group = db.genotypes.find_one(
                                {'identifier': prev_identifier})['groups']
                        except:
                            prev_group = []
                        if group not in prev_group:
                            prev_group.append(group)
                        db.genotypes.update({'identifier': prev_identifier},
                            {"$set": {'groups': prev_group}})
                    # NOTE(review): 'marker' was clobbered by the flush
                    # loop above, so the new sample's first marker is
                    # recorded as the OLD sample's last marker — bug.
                    markers = [marker]
                    allele1s = [allele1]
                    allele2s = [allele2]
                prev_identifier = identifier
            # Final flush for the last sample in the file.
            marker_dict = {}
            for i, marker in enumerate(markers):
                alleles = allele1s[i], allele2s[i]
                marker_dict[marker] = utils.encode_alleles(
                    alleles, type_, marker)
            prev_record = \
                db.genotypes.find_one({'identifier': identifier})
            if prev_record:
                if overwrite or (type_ not in prev_record.keys()):
                    db.genotypes.update({'identifier': prev_identifier},
                        {'$set': {type_: marker_dict}})
            else:
                db.genotypes.insert({
                    'identifier': prev_identifier,
                    type_: marker_dict,
                })
            if group:
                try:
                    prev_group = db.genotypes.find_one(
                        {'identifier': prev_identifier})['groups']
                except:
                    prev_group = []
                if group not in prev_group:
                    prev_group.append(group)
                db.genotypes.update({'identifier': prev_identifier},
                    {"$set": {'groups': prev_group}})

    def _add_file_csv(self, infile, type_, overwrite=False, group=None):
        """Load an STR CSV (index column = identifier) row by row.

        Parse errors for a row are printed and the row is skipped.
        """
        data = pd.read_csv(infile, index_col=0)
        with get_mongodb(self.dbname) as db:
            for identifier, str_ in data.iterrows():
                try:
                    prev_record =\
                        db.genotypes.find_one({'identifier':identifier})
                    if prev_record:
                        if overwrite or (type_ not in prev_record.keys()):
                            db.genotypes.update({'identifier':identifier},
                                {'$set': {type_:
                                    utils.encode_genotype(str_.to_dict(),\
                                        type_)
                                }})
                    else:
                        db.genotypes.insert({
                            'identifier': identifier,
                            type_: utils.encode_genotype(str_.to_dict(),
                                type_)
                        })
                except:
                    # Best-effort load: report the bad row and continue.
                    print('ERROR in ', identifier,'\n', str_)
                if group:
                    try:
                        prev_group = db.genotypes.find_one(
                            {'identifier': identifier})['groups']
                    except:
                        prev_group = []
                    if group not in prev_group:
                        prev_group.append(group)
                    db.genotypes.update({'identifier': identifier},
                        {"$set": {'groups': prev_group}})

    def _add_file_json(self, infile, type_, overwrite=False, group=None):
        """Load genotypes from a JSON array of objects.

        Each object must have an 'identifier' key; the *type_* key, when
        present, maps marker names to alleles (re-encoded before insert).
        """
        genotypes_json = json.loads(infile.read())
        with get_mongodb(self.dbname) as db:
            for genotype_json in genotypes_json:
                identifier = genotype_json['identifier']
                if type_ in genotype_json:
                    str_ = genotype_json[type_]
                    for marker_name, alleles in str_.items():
                        str_[marker_name] = utils.encode_alleles(
                            alleles, type_, marker_name)
                    prev_record = \
                        db.genotypes.find_one({'identifier':identifier})
                    if prev_record:
                        if overwrite or (type_ not in prev_record.keys()):
                            db.genotypes.update({'identifier': identifier},
                                {'$set': {type_: str_}})
                        else:
                            print('{} already exist, '
                                'you can use overwrite option'.format(identifier))
                            continue
                    else:
                        db.genotypes.insert({
                            'identifier': identifier,
                            type_: str_,
                        })
                    if group:
                        try:
                            # NOTE(review): 'prev_identifier' is undefined
                            # here — NameError is swallowed by the bare
                            # except, prev_group resets to [], and the
                            # update below also targets the undefined name.
                            # Almost certainly meant 'identifier'.
                            prev_group = db.genotypes.find_one(
                                {'identifier': prev_identifier})['groups']
                        except:
                            prev_group = []
                        if group not in prev_group:
                            prev_group.append(group)
                        db.genotypes.update({'identifier': prev_identifier},
                            {"$set": {'groups': prev_group}})
[문서] def genmark_txt_to_df(self, txtfile_or_path, type_): if type(txtfile_or_path) == str: txtfile = open(txtfile_or_path, encoding='utf-8') else: txtfile = txtfile_or_path genotypes = [] column_line = next(txtfile) if type(column_line) != str: column_line = column_line.decode('utf-8') column_list = column_line.strip().split('\t') column_position = {column:position for position, column in\ enumerate(column_list)} for essential_column in ['Sample Name','Marker','Allele 1','Allele 2']: if essential_column not in column_list: raise ParsingException('There is no "{}" column in Genmapper\ file.'.format(essential_column)) index_sample_name = column_position['Sample Name'] index_marker = column_position['Marker'] index_allele_1 = column_position['Allele 1'] index_allele_2 = column_position['Allele 2'] max_index = max([index_sample_name, index_marker, index_allele_1,\ index_allele_2]) prev_identifier = '' genotype = {} line_number = 1 for line in txtfile: line_number += 1 if type(line) != str: line = line.decode('utf-8') line = line.strip() if not line or line.startswith('Sample Name'): continue words = line.split('\t') if len(words) < max_index: continue identifier = words[index_sample_name] try: marker = words[index_marker] except IndexError: raise ParsingException('There is no "marker" in Genmapper file' ' line {}.'.format(line_number)) marker = utils.encode_marker(marker, type_) if marker in config.IGNORE_MARKERS[type_]: continue try: allele1 = words[index_allele_1] except IndexError: raise ParsingException('There is no "allele1" in Genmapper file' ' line {}.'.format(line_number)) try: allele2 = words[index_allele_2] except IndexError: raise ParsingException('There is no "allele2" in Genmapper file' ' line {}.'.format(line_number)) if allele2 == '': allele2 = allele1 if prev_identifier and prev_identifier != identifier: genotype['identifier'] = prev_identifier genotypes.append(genotype) genotype = {} if marker in config.SEX_MARKERS: genotype.update({ marker: 
'{}{}'.format(allele1, allele2), }) else: genotype.update({ marker: '{}, {}'.format(allele1, allele2), }) prev_identifier = identifier if prev_identifier and genotype: genotype['identifier'] = prev_identifier genotypes.append(genotype) columns = [] for genotype in genotypes: columns += [key for key in genotype] columns = list(set(columns)) columns.remove('identifier') columns = ['identifier'] + columns df = pd.DataFrame(genotypes, columns=columns) return df
[문서] def mtdna_genmark_txt_to_df(self, txtfile_or_path): if type(txtfile_or_path) == str: txtfile = open(txtfile_or_path, encoding='utf-8') else: txtfile = txtfile_or_path headers = [] bodies = [] for line in txtfile: if type(line) != str: line = line.decode('utf-8') sp = line.strip().split('\t') if sp[0] == 'Specimen': headers = sp line = next(txtfile) if type(line) != str: line = line.decode('utf-8') bodies = line.strip().split('\t') break if headers == []: raise ParsingException('There is no Specimen in Genmapper file.') elif len(headers) != len(bodies): raise ParsingException('The numbers of header and body are\ different.') columns = ['identifier'] values = [bodies[0]] for i in range(1,len(headers)): try: trans, position, area = headers[i].split('___') except ValueError: raise ParsingException('marker name "{}" is not proper\ in Genmapper file.'.format(headers[i])) try: gene_code, remainder = bodies[i].split(' ') except ValueError: raise ParsingException('genotype "{}" is not proper in\ Genmapper file.'.format(bodies[i])) if gene_code == '-': gene_code = 'd' if trans == '-': additional_num = 1 start_index = headers.index(headers[i]) while headers.index(headers[i], start_index) != i: additional_num += 1 start_index = headers.index(headers[i], start_index+1) position = position + '.{}'.format(additional_num) columns.append(utils.encode_marker(position, 'mtDNA')) values.append(gene_code) df = pd.DataFrame(pd.Series(values)).T df.columns = columns return df