# Source code for kinmatch.db

# -*- coding:UTF-8 -*-
from __future__ import unicode_literals
import os
import sys
import json
import csv
import time
from contextlib import contextmanager
from pymongo import MongoClient
import pandas as pd
import numpy as np
import time

from kinmatch import config
from kinmatch import utils
from kinmatch.utils import DataContainer
from kinmatch import (AlreadyExistException, 
                        FileTypeException,
                        NoValueException,
                        DBConnectionException,
                        ParameterException,
                        ParsingException)

@contextmanager
def get_mongodb(dbname=config.DEFAULT_MONGODB_NAME,
                user=config.DEFAULT_MONGODB_USER,
                password=config.DEFAULT_MONGODB_PASSWORD):
    """Context manager yielding an authenticated MongoDB database handle.

    Connects to ``config.MONGODB_URL``, authenticates *user* against
    *dbname*, yields the database object, and always closes the client
    on exit.

    Raises:
        DBConnectionException: if authentication fails.
    """
    client = MongoClient(config.MONGODB_URL)
    try:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any driver error still maps to the
        # project exception callers expect.
        client.the_database.authenticate(user, password, source=dbname)
    except Exception:
        # BUG FIX: the client used to leak when authentication failed.
        client.close()
        raise DBConnectionException('Cannot connect database')
    try:
        yield client[dbname]
    finally:
        # The original re-raised exceptions explicitly and had an empty
        # ``else`` branch; propagation is automatic, only cleanup remains.
        client.close()
class ResourceManager:
    """Abstract CRUD base class for the MongoDB-backed managers.

    Subclasses override each operation; the base implementations only
    raise ``NotImplementedError``.
    """

    def __init__(self, dbname=config.DEFAULT_MONGODB_NAME):
        # Name of the MongoDB database every operation targets.
        self.dbname = dbname

    def create(self, *args, **kwargs):
        """Create a resource (abstract)."""
        raise NotImplementedError("create is not implemented.")

    def read(self, *args, **kwargs):
        """Fetch a resource (abstract)."""
        raise NotImplementedError("read is not implemented.")

    def update(self, *args, **kwargs):
        """Modify a resource (abstract)."""
        raise NotImplementedError("update is not implemented.")

    def delete(self, *args, **kwargs):
        """Remove a resource (abstract)."""
        raise NotImplementedError("delete is not implemented.")

    def list(self, *args, **kwargs):
        """Enumerate resources (abstract)."""
        raise NotImplementedError("list is not implemented.")
class TaskManager(ResourceManager):
    """CRUD manager for multiple-search task documents in MongoDB."""

    def create(self, identifier, query_group, query_type, query_count,
               query_first, query_second, target_group, target_type,
               target_count, target_first, target_second, relationship_type,
               description, partial, astr_option, ystr_option, mtdna_option,
               dbname):
        """Register a new task document with status ``Submitted``.

        NOTE(review): the ``dbname`` argument is accepted but unused
        (``self.dbname`` is what the connection uses); kept for backward
        compatibility with existing callers.
        """
        self.identifier = identifier
        self.query_group = query_group
        self.query_type = query_type
        self.query_count = query_count
        self.query_first = query_first
        self.query_second = query_second
        self.target_group = target_group
        self.target_type = target_type
        self.target_count = target_count
        self.target_first = target_first
        self.target_second = target_second
        self.relationship_type = relationship_type
        self.description = description
        self.partial = partial
        self.astr_option = astr_option
        self.ystr_option = ystr_option
        self.mtdna_option = mtdna_option
        self.status = 'Submitted'
        with get_mongodb(self.dbname) as db:
            db.multiple_search_task.insert({
                'identifier': self.identifier,
                'query_group': self.query_group,
                'query_type': self.query_type,
                'query_first': self.query_first,
                'query_second': self.query_second,
                'query_count': self.query_count,
                'target_group': self.target_group,
                'target_type': self.target_type,
                'target_count': self.target_count,
                'target_first': self.target_first,
                'target_second': self.target_second,
                'status': self.status,
                # BUG FIX: the format string was '%Y%m%H%M%S' — the
                # day-of-month directive '%d' was missing, producing
                # ambiguous, wrongly-sorted timestamps.
                'registered_time': time.strftime('%Y%m%d%H%M%S'),
                'progress': 0,
                'result': None,
                'relationship_type': self.relationship_type,
                'description': self.description,
                'partial': self.partial,
                'astr_option': self.astr_option,
                'ystr_option': self.ystr_option,
                'mtdna_option': self.mtdna_option,
            })

    def get(self, identifier):
        """Return the raw task document for *identifier*.

        Raises:
            NoValueException: when no such task exists.
        """
        with get_mongodb(self.dbname) as db:
            task = db.multiple_search_task.find_one(
                {'identifier': identifier})
            if task is None:
                raise NoValueException('There is no task {0}'.
                                       format(identifier))
            return task

    def read(self, identifier):
        """Return the task wrapped in a DataContainer.

        The stored JSON ``result`` (if any) is decoded into a transposed
        DataFrame for presentation.
        """
        task = self.get(identifier)
        del task['_id']
        if task['result']:
            task['result'] = pd.DataFrame(json.loads(task['result'])).T
        return DataContainer(
            status='Success',
            message='Fetching task identifier {}'.format(identifier),
            items=task)

    def update(self, identifier, field, value):
        """Set a single *field* of the task document to *value*."""
        with get_mongodb(self.dbname) as db:
            db.multiple_search_task.update({'identifier': identifier},
                                           {"$set": {field: value}})

    def delete(self, identifier):
        """Remove the task document(s) matching *identifier*."""
        with get_mongodb(self.dbname) as db:
            db.multiple_search_task.remove({'identifier': identifier})
        message = 'Deleted tasks {}'.format(identifier)
        return DataContainer(status='Success', message=message)

    def list(self, identifier=None, query_group=None, query_type=None,
             target_group=None, target_type=None, description=None,
             status=None, sort='registered_time', desc=True, paginate_by=20,
             page=None):
        """List tasks filtered by regex on the given fields, paginated.

        Raises:
            ParameterException: for an unsupported ``query_type`` or
                ``sort`` value.
        """
        ## check arguments
        if query_type is not None and \
                query_type not in ['all', 'range', 'keyword', 'manual']:
            raise ParameterException('"{}" cannot be used for "query_type"'
                                     ' value'.format(query_type))
        if sort not in ['identifier', 'query_group', 'query_type',
                        'query_count', 'target_group', 'target_type',
                        'target_count', 'status', 'progress',
                        'registered_time']:
            raise ParameterException('{} cannot be used for "sort" value'.
                                     format(sort))
        ## build a regex filter for every non-None argument
        mongodb_query = {}
        for k, v in {'identifier': identifier, 'query_group': query_group,
                     'query_type': query_type, 'target_group': target_group,
                     'target_type': target_type, 'description': description,
                     'status': status}.items():
            if v is not None:
                mongodb_query[k] = {'$regex': v}
        with get_mongodb(self.dbname) as db:
            tasks = db.multiple_search_task.find(mongodb_query)
            desc = -1 if desc else 1
            tasks.sort(sort, desc)
            results = list()
            for task in tasks:
                # strip fields that are large or not JSON-serializable
                if '_id' in task:
                    del task['_id']
                if 'result' in task:
                    del task['result']
                results.append(task)
        total_count = len(results)
        if page is not None:
            page = int(page)
            results = results[(page - 1) * paginate_by:page * paginate_by]
        # NOTE(review): round() here floors/rounds the page count rather
        # than taking the ceiling — last partial page may be dropped from
        # the count; preserved for backward compatibility.
        return DataContainer(status='Success',
                             message='Fetching tasks',
                             totalCount=total_count,
                             totalPageCount=round(total_count / paginate_by),
                             items=results)

    def get_alltasks(self, field=None, query=None, sort='registered_time',
                     desc=True):
        """Return a sorted cursor over all tasks, optionally regex-filtered
        on *field*."""
        desc = -1 if desc else 1
        with get_mongodb(self.dbname) as db:
            if field:
                tasks = db.multiple_search_task.find(
                    {field: {'$regex': query}})
            else:
                tasks = db.multiple_search_task.find()
            tasks.sort(sort, desc)
            return tasks
class GroupManager(ResourceManager):
    """Manager for sample groups; most CRUD hooks are placeholders."""

    def create(self, identifier):
        """Not implemented yet."""
        pass

    def read(self, identifier):
        """Not implemented yet."""
        pass

    def update(self, identifier):
        """Not implemented yet."""
        pass

    def delete(self, identifier):
        """Not implemented yet."""
        pass

    def list(self):
        """Return group list (not implemented yet)."""
        pass

    def ids_belong_group(self, group):
        """Return the identifiers of all genotypes tagged with *group*."""
        with get_mongodb(self.dbname) as db:
            cursor = db.genotypes.find({'groups': group}, {'identifier': 1})
            return [record['identifier'] for record in cursor]
class AlleleFrequenciesManager(ResourceManager):
    """Manager for allele-frequency tables stored per name/marker-type."""

    # Marker panels used by make_current_db() when rebuilding the
    # 'current' frequency tables from the genotype collection.
    current_allele_frequency = {
        'astr_15': {'type_': 'A-STR',
                    'markers': ['D8S1179', 'D21S11', 'D7S820', 'CSF1PO',
                                'D3S1358', 'TH01', 'D13S317', 'D16S539',
                                'D2S1338', 'D19S433', 'vWA', 'TPOX',
                                'D18S51', 'D5S818', 'FGA']},
        'astr_23': {'type_': 'A-STR',
                    'markers': ['D8S1179', 'D21S11', 'D7S820', 'CSF1PO',
                                'D3S1358', 'TH01', 'D13S317', 'D16S539',
                                'D2S1338', 'D19S433', 'vWA', 'TPOX',
                                'D18S51', 'D5S818', 'FGA', 'Penta_E',
                                'Penta_D', 'D10S1248', 'D12S391', 'D1S1656',
                                'D22S1045', 'D2S441', 'SE33']},
        'ystr_16': {'type_': 'Y-STR',
                    'markers': ['DYS385', 'DYS389II', 'DYS392']},
        'ystr_23': {'type_': 'Y-STR',
                    'markers': ['DYS385', 'DYS389II', 'DYS392', 'DYS635',
                                'GATAH4__1']},
    }

    def create(self, infile, informat, name, type_, description=None):
        """Upload a new allele-frequency table (no overwrite)."""
        result = self.add(infile, informat, name, type_, overwrite=False,
                          initial=False, description=description)
        if result is True:
            stat = 'Success'
            message = 'Success uploading {}'.format(name)
        else:
            stat = 'Failure'
            message = 'Failure uploading {}'.format(name)
        return DataContainer(status=stat, message=message)

    def read(self, name, type_):
        """Fetch the stored tables for *name* as a DataContainer."""
        result = self.get_from_read(name, type_)  ## return dictionary
        return DataContainer(
            status='Success',
            message='Success fetching Allele frequency {}'.format(name),
            items=result,
            trans=True)

    def update(self, infile, informat, name, type_, description=None):
        """Upload an allele-frequency table, overwriting any existing one."""
        result = self.add(infile, informat, name, type_, overwrite=True,
                          initial=False, description=description)
        if result is True:
            stat = 'Success'
            message = 'Success uploading {}'.format(name)
        else:
            stat = 'Failure'
            message = 'Failure uploading {}'.format(name)
        return DataContainer(status=stat, message=message)

    def delete(self, name):
        """Delete every table stored under *name*."""
        result = self.delete_allele_frequencies(name)
        if result is True:
            stat = 'Success'
            message = 'Success deleting {}'.format(name)
        else:
            stat = 'Failure'
            message = 'Failure deleting {}'.format(name)
        return DataContainer(status=stat, message=message)

    def list(self):
        """Summarize all stored tables as a DataFrame in a DataContainer."""
        result = self.export_allele_frequency()  ## return dataframe
        return DataContainer(status='Success', message='', items=result)

    def add(self, infile, informat, name, type_, overwrite=False,
            initial=False, description=None):
        """Parse *infile* (CSV/XLS/XLSX) and store it under *name*/*type_*.

        Allele indices are multiplied by 10 and stored as strings; marker
        columns are encoded via ``utils.encode_marker``.

        Raises:
            AlreadyExistException: table exists and ``overwrite`` is False.
            FileTypeException: unsupported ``informat``.
        """
        with get_mongodb(self.dbname) as db:
            if initial:
                db.allele_frequencies.remove()
            prev_record = db.allele_frequencies.find_one({'name': name})
        already_exist = bool(prev_record and type_ in prev_record.keys())
        if already_exist and not overwrite:
            raise AlreadyExistException('Allele frequencies {} already'
                                        ' existed'.format(name))
        if informat.lower() == 'csv':
            data = pd.read_csv(infile, index_col=0)
        elif informat.lower() in ('xls', 'xlsx'):
            excel_file = pd.ExcelFile(infile)
            data = excel_file.parse(excel_file.sheet_names[0], index_col=0)
        else:
            raise FileTypeException('Input file must be CSV or XLS, XLSX')
        # Alleles like 12.3 become integer keys (123) stored as strings,
        # avoiding float keys in MongoDB documents.
        data.index = data.index * 10
        data.index = data.index.astype(object).astype(int).astype(str)
        data.columns = [utils.encode_marker(column, type_)
                        for column in data.columns]
        with get_mongodb(self.dbname) as db:
            if prev_record and \
                    ((not already_exist) or (already_exist and overwrite)):
                db.allele_frequencies.update({'name': name}, {'$set': {
                    type_: data.to_dict(),
                    'description': description,
                }})
            else:
                db.allele_frequencies.insert({
                    'name': name,
                    type_: data.to_dict(),
                    'description': description,
                })
            return True
        return False

    def make_current_db(self):
        """Rebuild the panel frequency tables from the genotype collection.

        NOTE(review): ``name`` is initialized to 'current' but immediately
        reused as the loop variable over ``current_allele_frequency``, so
        the final description update targets the last panel iterated, not
        a 'current' record — preserved as-is, confirm intent.
        """
        name = 'current'
        gm = GenotypeManager(dbname=self.dbname)
        for name in self.current_allele_frequency:
            type_ = self.current_allele_frequency[name]['type_']
            markers = self.current_allele_frequency[name]['markers']
            df = pd.DataFrame()
            try:
                genotypes = gm.get_genotypes(type_)
                genotypes = genotypes[markers]
                # keep only samples typed at every panel marker
                for marker in markers:
                    genotypes = genotypes[genotypes[marker].notnull()]
            except:
                # panel marker missing entirely — skip this panel
                continue
            for marker in markers:
                total_count = 0
                alleles_count = {}
                for allele in genotypes[marker]:
                    if type_ in ['A-STR', 'Y-STR']:
                        try:
                            allele_1, allele_2 = allele[0], allele[1]
                        except:
                            continue
                    elif type_ == 'mtDNA':
                        if not pd.isnull(allele):
                            allele_1, allele_2 = allele, allele
                        else:
                            continue
                    else:
                        try:
                            allele_1, allele_2 = allele[0], allele[0]
                        except:
                            continue
                    total_count += 2
                    try:
                        alleles_count[str(allele_1)][0] += 1
                    except:
                        alleles_count[str(allele_1)] = [1]
                    try:
                        alleles_count[str(allele_2)][0] += 1
                    except:
                        alleles_count[str(allele_2)] = [1]
                marker = utils.encode_marker(marker, type_)
                alleles_frequency = pd.DataFrame(
                    alleles_count, index=[marker]) / float(total_count)
                df = df.append(alleles_frequency)
            # NOTE(review): truth-testing a DataFrame raises ValueError on
            # modern pandas; relies on the legacy pandas this module was
            # written against — confirm before upgrading.
            if df:
                df = df.T
                with get_mongodb(self.dbname) as db:
                    if db.allele_frequencies.find({'name': name}).count() > 0:
                        db.allele_frequencies.update(
                            {'name': name},
                            {"$set": {type_: df.to_dict()}})
                    else:
                        db.allele_frequencies.insert({
                            'name': name,
                            type_: df.to_dict()
                        })
        with get_mongodb(self.dbname) as db:
            if db.allele_frequencies.find({'name': name}).count() > 0:
                db.allele_frequencies.update(
                    {'name': name},
                    {"$set": {'description':
                        'allele frequencies status in current database' +
                        ' ({})'.format(time.strftime(
                            '%Y/%m/%d %H:%M', time.localtime()))}})

    def get_names_info(self):
        """Return [name, type, #markers, #alleles] for every stored table.

        NOTE(review): ``type_`` stays unbound if a record contains none of
        A-STR/Y-STR/mtDNA — assumed impossible for valid data; confirm.
        """
        name_and_info = []
        with get_mongodb(self.dbname) as db:
            for record in db.allele_frequencies.find():
                if 'A-STR' in record:
                    type_ = 'A-STR'
                elif 'Y-STR' in record:
                    type_ = 'Y-STR'
                elif 'mtDNA' in record:
                    type_ = 'mtDNA'
                af = pd.DataFrame.from_dict(record[type_])
                name_and_info.append([record['name'], type_,
                                      str(len(af.columns)),
                                      str(len(af.index))])
        return name_and_info

    def get(self, name, type_):
        """Return the stored table for *name*/*type_* as a DataFrame
        (or the description string when ``type_ == 'description'``).

        Raises:
            NoValueException: when no matching record exists.
        """
        with get_mongodb(self.dbname) as db:
            af = db.allele_frequencies.find_one(
                {'name': name, type_: {"$exists": 1}})
            if af is None:
                raise NoValueException(
                    'There is no Allele Frequency {0} {1}'.format(
                        name, type_))
            if type_ == 'description':
                return af['description']
            # create dataframe
            df = pd.DataFrame.from_dict(af[type_])
            df.index = df.index.astype(int)
            # NOTE(review): legacy pandas sort-by-index; its return value
            # is discarded, so this may be a no-op — confirm on upgrade.
            df.sort()
            return df

    def get_from_read(self, name, type_):
        """Return every stored table for *name*, allele indices rescaled
        back from the x10 storage encoding."""
        result = {}
        with get_mongodb(self.dbname) as db:
            try:
                af_types = list(
                    db.allele_frequencies.find_one({'name': name}).keys())
            except AttributeError:
                # find_one returned None — no such record
                raise NoValueException(
                    'AlleleFrequency {} does not exist'.format(name))
        if (type_ is not None) and (type_ not in af_types):
            return None
        af_types.remove('_id')
        af_types.remove('name')
        for type2 in af_types:
            df = self.get(name, type2)
            if type2 != 'description':
                # undo the x10 integer encoding: 123 -> '12.3', 120 -> '12'
                df.index = [str(id_ / 10.0).replace('.0', '')
                            for id_ in df.index]
            result[type2] = df
        return result

    def export_allele_frequency(self):
        """Summarize every stored table; None when nothing is stored."""
        column_names = ['name', 'type', 'number of markers',
                        'number of alleles', 'description']
        with get_mongodb(self.dbname) as db:
            result = list()
            for af in db.allele_frequencies.find():
                desc = af['description'] if 'description' in af else None
                for type_ in af:
                    if type_ not in ['name', '_id', 'description']:
                        af_df = pd.DataFrame.from_dict(af[type_])
                        result.append([af['name'], type_,
                                       len(af_df.columns),
                                       len(af_df.index), desc])
        return None if not result else pd.DataFrame(result,
                                                    columns=column_names)

    def delete_allele_frequencies(self, name):
        """Remove all records stored under *name*; always returns True."""
        with get_mongodb(self.dbname) as db:
            db.allele_frequencies.remove({'name': name})
        return True

    def show(self, name, type_, outfile, seperator='tab', excelfile=''):
        """Write the table for *name*/*type_* to *outfile* (CSV) or to
        *excelfile* (XLSX) when the latter is given.

        Raises:
            ParameterException: for an unsupported ``seperator`` value.
        """
        if seperator == 'tab':
            sep = '\t'
        elif seperator == 'comma':
            sep = ','
        else:
            # BUG FIX: any other value previously left ``sep`` unbound and
            # crashed with UnboundLocalError on the CSV path; fail with the
            # project's standard parameter error instead.
            raise ParameterException(
                '"{}" cannot be used for "seperator" value'.format(seperator))
        df = self.get(name, type_)
        df.index = df.index / 10.
        if excelfile:
            df.to_excel(excelfile, sheet_name='Sheet1')
        else:
            df.to_csv(outfile, sep=sep)
[문서]class GenotypeManager(ResourceManager):
[문서] def create(self, infile, informat='GeneMark', type_='A-STR', group=None): if informat not in ['GeneMark', 'CSV', 'TSV', 'XLSX']: raise ParameterException('informat argument has wrong value') result = self.add_file(infile, type_, format=informat, overwrite=False, initial=False, group=group) ## return True if result is True: stat = 'Success' message = 'Success uploading {}'.format(type_) else: stat = 'Failure' message = 'Failure uploading {}'.format(type_) return DataContainer(status=stat, message=message)
[문서] def read(self, identifier, type_=None, kit='All', null='-'): result = self.get_genotype(identifier) del(result['_id']) del(result['identifier']) if not result: raise NoValueException('identifier {} does not exist'.format(\ identifier)) if type_ is not None: if type_ not in result: return DataContainer(status='Success',\ message='There is no {} related identifier'.format(\ type_)) # return None result = {type_: result[type_]} types = list(result.keys()) for type_ in types: if type_ == 'A-STR': gc = utils.ASTRGenotypeCleaner(result[type_], is_encoded=True) astrs = gc.decode() result[type_] = [({marker:astrs[marker]} if (marker in astrs)\ else {marker:null}) for marker in\ config.MARKERS_IN_KITS['A-STR'][kit]] elif type_ == 'Y-STR': gc = utils.YSTRGenotypeCleaner(result[type_], is_encoded=True) ystrs = gc.decode() result[type_] = [({marker:ystrs[marker]} if (marker in ystrs)\ else {marker:null}) for marker in\ config.MARKERS_IN_KITS['Y-STR'][kit]] elif type_ == 'mtDNA': gc = utils.MtdnaGenotypeCleaner(result[type_], is_encoded=True) mtdnas = gc.decode() result[type_] = mtdnas else: pass return DataContainer(status='Success',\ message='Success fetching Genotype {}'.format(identifier),\ items=result, trans=True)
[문서] def update(self, file_or_individual, *args, **kwargs): if file_or_individual == 'file': return self.update_by_file(*args, **kwargs) elif file_or_individual == 'individual': return self.update_by_genotype(*args, **kwargs)
[문서] def update_by_file(self, infile, informat='GeneMark', type_='A-STR',\ save_by='replacement', group=None): infile = iter(infile.readlines()) ## check save_bys if save_by not in ['replacement', 'merge-with-overwrite',\ 'merge-without-overwrite']: raise ParameterException('save_by argument has wrong value') if informat.lower() not in ['genemark', 'csv', 'tsv', 'xlsx']: raise ParameterException('informat argument has wrong value') ## parse file to make dataframe if informat.lower() == 'csv': df = pd.read_csv(infile, encoding='cp949', index_col=['identifier']) df = df.astype(object).fillna('').astype(str) df.columns = [str(c).replace('.0', '') for c in df.columns] if informat.lower() == 'tsv': df = pd.read_csv(infile, encoding='cp949',\ index_col=['identifier'], sep='\t') df = df.astype(object).fillna('').astype(str) df.columns = [str(c).replace('.0', '') for c in df.columns] elif informat.lower() == 'xlsx': excel_file = pd.ExcelFile(infile) df = excel_file.parse(excel_file.sheet_names[0])#, index_col=0) df = df.set_index('identifier') df = df.astype(object).fillna('').astype(str) df.columns = [str(c).replace('.0', '') for c in df.columns] elif informat.lower() == 'genemark': if type_ == 'mtDNA': df = self.mtdna_genmark_txt_to_df(infile) elif type_ in ['A-STR', 'Y-STR']: df = self.genmark_txt_to_df(infile, type_) df = df.set_index('identifier') if type_ == 'mtDNA': for m in df.columns: m_ori = utils.decode_marker(m, 'mtDNA') m = float(m_ori) if not ((16024 <= m <= 16365) | (73 <= m <= 340) |\ (438 <= m <= 574)): raise ParameterException('marker "{}" cannot used'.\ format(m_ori)) ## update database for identifier, series in df.iterrows(): series = series[series.notnull()] genotype = utils.encode_genotype(series.to_dict(), type_, None) if save_by == 'replacement': self.set_genotype(identifier, type_, genotype) elif save_by == 'merge-with-overwrite': self.set_genotype_merge_with_overwrite(identifier, type_,\ genotype) elif save_by == 'merge-without-overwrite': 
self.set_genotype_merge_without_overwrite(identifier, type_,\ genotype) stat = 'Success' message = 'Success uploading {}'.format(type_) return DataContainer(status=stat, message=message)
[문서] def update_by_genotype(self, type_, identifier, genotype, save_by='merge-with-overwrite'): if save_by not in ['merge-with-overwrite']: raise ParameterException('save_by argument has wrong value') if type_.lower() == 'mtDNA': for m,v in genotype.items(): if not ((16024 <= m <= 16365) | (73 <= m <= 340) |\ (438 <= m <= 574)): raise ParameterException('marker "{}" cannot used'.\ format(m)) encoded_genotype = {utils.encode_marker(m, type_):\ utils.encode_alleles(v, type_, utils.encode_marker(m, type_))\ for m,v in genotype.items()} self.set_genotype_merge_with_overwrite(identifier, type_,\ encoded_genotype) stat = 'Success' message = 'Success uploading {}'.format(type_) return DataContainer(status=stat, message=message)
[문서] def delete(self, identifier, genotype='all'): # result = self.delete_genotype(identifier) ## return None result = self.remove_genotype(identifier, genotype) if result is True: stat = 'Success' message = 'Success deleting {}'.format(identifier) else: stat = 'Failure' message = 'Failure deleting {}'.format(identifier) return DataContainer(status=stat, message=message)
[문서] def list(self, type_=None, identifier=None, page=None, paginate_by=10,\ sort=None, desc=False, filter_astr=False, filter_ystr=False,\ filter_mtdna=False): if type_ is None: temp_results = [] with get_mongodb(self.dbname) as db: if identifier: find_result = db.genotypes.find({'identifier':\ {'$regex': identifier}}) else: find_result = db.genotypes.find() for sample in find_result: id_ = sample['identifier'] del(sample['_id']) del(sample['identifier']) temp_result = {} for key in sample: if key == 'groups': temp_result[key] = ','.join(sample[key]) else: temp_result[key] = len([marker for marker in\ sample[key] if sample[key][marker] is\ not None]) temp_results.append([id_, temp_result]) if not temp_results: return DataContainer(status='Success',\ message='There is no Genotype "{}"'.\ format(type_), items=[]) result = pd.DataFrame.from_items(temp_results).T result = result.replace(np.nan, 0) if filter_astr: result = result.loc[(result['A-STR']!=0)] if filter_ystr: result = result.loc[(result['Y-STR']!=0)] if filter_mtdna: result = result.loc[(result['mtDNA']!=0)] else: result = self.get_genotypes(type_) total_count = result.shape[0] if sort is not None: if sort == 'identifier': index = list(result.index) index.sort(reverse=not desc) result = result.loc[index] else: result = result.sort(sort, ascending=not desc) if page is not None: result = result.iloc[(page-1)*paginate_by : page*paginate_by] if type_ is not None: result = {type_:result} return DataContainer(status='Success', message='Success fetching Genotype {}'.\ format(type_), items=result, totalCount=total_count, totalPageCount=round(total_count/paginate_by), trans=False, zeroization=True)
[문서] def delete_genotype(self, identifier): with get_mongodb(self.dbname) as db: db.genotypes.remove({'identifier': identifier}) return True
[문서] def get_genotype(self, identifier): with get_mongodb(self.dbname) as db: genotype = db.genotypes.find_one({'identifier': identifier}) new_genotype = {} if genotype: for type_, type_genotype in genotype.items(): if type_genotype in ('A-STR', 'Y-STR', 'mtDNA'): g = {} for marker, alleles in type_genotype.items(): #marker = utils.decode_marker(marker, type_) g[markers] = alleles new_genotype[type_] = g else: new_genotype[type_] = type_genotype else: raise AssertionError('There is no such identifier {}'.format( identifier)) return new_genotype
[문서] def get_genotype_with_none(self, identifier): genotype = self.get_genotype(identifier) if genotype: for type_ in ('A-STR', 'Y-STR'): if type_ in genotype: for marker, alleles in genotype[type_].items(): if type(alleles) != list: genotype[type_][marker] = None if 'mtDNA' in genotype: for position, allele in genotype['mtDNA'].items(): if pd.isnull(allele): genotype['mtDNA'][position] = None for type_ in ('A-STR', 'Y-STR', 'mtDNA'): if type_ in genotype: if not any(genotype[type_].values()): genotype.pop(type_) return genotype
[문서] def remove_genotype(self, identifier, genotype='all'): if genotype != 'all': sample_genotype = self.get_genotype(identifier) for key in ['_id', genotype]: try: sample_genotype.pop(key) except KeyError: pass with get_mongodb(self.dbname) as db: record = db.genotypes.find_one({'identifier':identifier}) if not record: raise NoValueException('identifier {} does not exist'.format(\ identifier)) db.genotypes.remove({'identifier' : identifier}) if genotype != 'all': db.genotypes.insert({k:v for k,v in sample_genotype.items()}) return True
[문서] def set_identifier(self, identifier): with get_mongodb(self.dbname) as db: db.genotypes.insert({'identifier':identifier})
[문서] def set_genotype(self, identifier, type_, genotype): with get_mongodb(self.dbname) as db: db.genotypes.update({'identifier': identifier}, {'$set': { type_: genotype, }}, True)
[문서] def set_genotype_merge_with_overwrite(self, identifier, type_, genotype): with get_mongodb(self.dbname) as db: merged_genotype = {} # prev = db.genotypes.find_one({'identifier': identifier})[type_] try: prev = db.genotypes.find_one({'identifier': identifier})[type_] except: raise NoValueException( 'There is no identifier "{0}" or "{1}"'.format(identifier, type_)) if prev: all_markers = set(list(prev.keys()) + list(genotype.keys())) else: all_markers = list(genotype.keys()) for marker in all_markers: merged_genotype[marker] =\ genotype[marker] if (marker in genotype and\ genotype[marker]) else prev[marker] db.genotypes.update({'identifier': identifier}, {'$set': { type_: merged_genotype, }}, True)
[문서] def set_genotype_merge_without_overwrite(self, identifier, type_, genotype): with get_mongodb(self.dbname) as db: merged_genotype = {} # prev = db.genotypes.find_one({'identifier': identifier})[type_] try: prev = db.genotypes.find_one({'identifier': identifier})[type_] except: raise NoValueException( 'There is no identifier "{0}" or "{1}"'.format(identifier, type_)) if prev: all_markers = set(list(prev.keys()) + list(genotype.keys())) else: all_markers = list(genotype.keys()) for marker in all_markers: merged_genotype[marker] =\ prev[marker] if (marker in prev and prev[marker])\ else genotype[marker] db.genotypes.update({'identifier': identifier}, {'$set': { type_: merged_genotype, }}, True)
[문서] def add_group(self, identifiers, group): with get_mongodb(self.dbname) as db: for identifier in identifiers: record = db.genotypes.find_one({'identifier': identifier}) if record: if 'groups' not in record: record['groups'] = [] if group not in record['groups']: record['groups'].append(group) else: record = {'groups':[group]} db.genotypes.update({'identifier': identifier}, {'$set': { 'groups': record['groups'], }})
[문서] def get_genotypes(self, type_): with get_mongodb(self.dbname) as db: cursor = db.genotypes.find({type_:{"$exists":1}}, {'identifier': 1, type_: 1}) genotypes = pd.DataFrame.from_items( (g['identifier'], g[type_]) for g in cursor).T if type_ == 'mtDNA': new_columns = [] for c in genotypes.columns: #new_columns.append(utils.decode_marker(c, type_)) new_columns.append(c) genotypes.columns = new_columns return genotypes
[문서] def get_genotypes_by_group(self, type_, group): with get_mongodb(self.dbname) as db: cursor = db.genotypes.find( {type_:{"$exists":1}, 'groups':{"$in":[group]}}, {'identifier': 1, type_: 1, 'groups': 1}) genotypes = pd.DataFrame.from_items( (g['identifier'], g[type_]) for g in cursor).T return genotypes
[문서] def set_initial(self): # answer = input( # "'-t' option deletes all previous data. Is it right? (y/n) ") # if answer != 'y': # return with get_mongodb(self.dbname) as db: db.genotypes.remove()
[문서] def add_file(self, infile, type_, format='GeneMark', overwrite=False,\ initial=False, group=None): if initial: self.set_initial() if type_ == 'mtDNA': if format == 'GeneMark': self._add_mtDNA_genemark_file(infile, overwrite=overwrite, group=group) elif format == 'JSON': self._add_mtDNA_JSON_file(infile, overwrite=overwrite, group=group) elif format == 'CSV': self._add_mtDNA_csv_file(infile, overwrite=overwrite, group=group) else: raise FileTypeException('Input file must be CSV, JSON or\ GeneMark') else: if format == 'GeneMark': self._add_file_genemark(infile, type_,\ overwrite=overwrite, group=group) elif format == 'JSON': self._add_file_json(infile, type_, overwrite=overwrite, group=group) elif format == 'CSV': self._add_file_csv(infile, type_, overwrite=overwrite, group=group) else: raise FileTypeException('Input file must be CSV, JSON or\ GeneMark') return True
    def _add_mtDNA_genemark_file(self, infile, overwrite=False, group=None):
        """Parse a GeneMapper mtDNA export and upsert one genotype record.

        The first tab-separated row starting with 'Specimen' is the
        header; the very next row is the body.  Marker headers look like
        ``trans___position___area``; body cells look like ``code rest``.
        """
        type_ = 'mtDNA'
        headers = []
        bodies = []
        infile = iter(infile.readlines())
        for line in infile:
            if type(line) != str:
                # Bytes input (e.g. a file opened in binary mode).
                line = line.decode('utf-8')
            sp = line.strip().split('\t')
            if sp[0] == 'Specimen':
                headers = sp
                # The data row immediately follows the header row.
                line = next(infile)
                if type(line) != str:
                    line = line.decode('utf-8')
                bodies = line.strip().split('\t')
                break
        if headers == []:
            raise AssertionError('There is no Specimen in Genmapper file.')
        elif len(headers) != len(bodies):
            raise AssertionError('The number of header and body is different.')
        # First body cell is the project / sample identifier.
        project = bodies[0]
        mtDNA = {}
        for i in range(1, len(headers)):
            try:
                trans, position, area = headers[i].split('___')
            except ValueError:
                raise Exception('marker name "{}" is not proper in Genmapper '
                                'file.'.format(headers[i]))
            try:
                gene_code, remainder = bodies[i].split(' ')
            except ValueError:
                raise Exception('genotype "{}" is not proper in Genmapper '
                                'file.'.format(bodies[i]))
            if gene_code == '-':
                # '-' appears to denote a deletion — TODO confirm.
                gene_code = 'd'
            if trans == '-':
                # Repeated header names get a '.N' suffix: count how many
                # earlier columns carry the identical header text.
                additional_num = 1
                start_index = headers.index(headers[i])
                while headers.index(headers[i], start_index) != i:
                    additional_num += 1
                    start_index = headers.index(headers[i], start_index+1)
                position = position + '.{}'.format(additional_num)
            mtDNA[position] = gene_code
        with get_mongodb(self.dbname) as db:
            prev_record = \
                db.genotypes.find_one({'identifier':project})
            if prev_record:
                if overwrite or (type_ not in prev_record.keys()):
                    db.genotypes.update({'identifier':project},
                        {'$set': {type_:mtDNA}})
                else:
                    raise AlreadyExistException('Uploaded mtDNA example\
 already existed')
            else:
                db.genotypes.insert({
                    'identifier': project,
                    type_: mtDNA})
            if group:
                try:
                    prev_group = db.genotypes.find_one(
                        {'identifier': project})['groups']
                except:
                    # Record has no 'groups' field yet (KeyError) —
                    # start a fresh list.
                    prev_group = []
                if group not in prev_group:
                    prev_group.append(group)
                db.genotypes.update({'identifier': project},
                    {"$set": {'groups': prev_group}})

    def _add_mtDNA_csv_file(self, infile, overwrite=False, group=None):
        """Parse a hand-curated mtDNA CSV (identifier + 3 HV columns).

        Columns 1-3 hold whitespace-separated motifs such as '16189C'.
        A series of data-entry quirks are normalised before each motif is
        split into position and SNP letter.
        """
        csv_reader = csv.reader(infile)
        # Header row is consumed and ignored.
        columns = next(csv_reader)
        type_ = 'mtDNA'
        with get_mongodb(self.dbname) as db:
            for record in csv_reader:
                identifier = record[0]
                motif_dict = {}
                for hv in record[1:4]:
                    for motif_ in hv.split():
                        if motif_.startswith('불'):
                            # Korean annotation marker — skip this cell.
                            continue
                        # Known data-entry typos, fixed case by case.
                        if motif_ == '16183C16189C':
                            motif_ = '16183C,16189C'
                        if motif_ == '309.1':
                            motif_ = '309.1C'
                        if motif_ == '235G315.1C':
                            motif_ = '235G,315.1C'
                        if motif_ == '315C.1C':
                            motif_ = '315.1C'
                        if motif_ == '16209Y16218T':
                            motif_ = '16209Y,16218T'
                        if '.p' in motif_:
                            # '.p' placeholder: assign the next free
                            # insertion number after the highest sibling
                            # series (e.g. '309.p' after '309.2' -> '309.3').
                            int_part = motif_.split('.p')[0]
                            temp_series = [m for m in hv.split() if\
                                int_part in m]
                            temp_series.remove(motif_)
                            temp_series.sort()
                            max_series_num = int(temp_series[-1][-2])
                            motif_ = '{}.{}{}'.format(int_part,
                                max_series_num+1, motif_[-1])
                        for motif in [m for m in motif_.split(',') if m]:
                            motif = motif.replace('?', '')
                            if motif.endswith('.'):
                                motif = motif.replace('.', '')
                            if motif[-1] in '1234567890':
                                # Trailing digit with no SNP letter —
                                # drop it.  NOTE(review): this also
                                # shortens the position; confirm intent.
                                motif = motif[:-1]
                            """
                            if '.' in motif:
                                motif_splited = motif.split('.')
                                position = motif_splited[0]
                                snp = motif_splited[1]
                                if position[-1] not in '1234567890':
                                    position = position[:-1]
                            else:
                                position = motif[:-1]
                                snp = motif[-1]
                            """
                            # Last character is the SNP letter; the rest
                            # is the nucleotide position.
                            position = motif[:-1]
                            try:
                                int_position = float(position)
                            except:
                                print("Not int: {}, {}".format(identifier,
                                    position))
                            #position = position.replace('.','__')
                            position = utils.encode_marker(position, type_)
                            snp = motif[-1]
                            motif_dict[position] = snp
                prev_record = db.genotypes.find_one({'identifier': identifier})
                if prev_record:
                    if overwrite or (type_ not in prev_record):
                        db.genotypes.update({'identifier': identifier},
                            {'$set': {type_: motif_dict}})
                else:
                    db.genotypes.insert({
                        'identifier': identifier,
                        type_: motif_dict,
                    })
                if group:
                    try:
                        # NOTE(review): 'prev_identifier' is never defined
                        # in this method — this always raises NameError,
                        # the bare except swallows it, and prev_group
                        # starts empty (clobbering any existing groups).
                        # Almost certainly meant 'identifier'.
                        prev_group = db.genotypes.find_one(
                            {'identifier': prev_identifier})['groups']
                    except:
                        prev_group = []
                    if group not in prev_group:
                        prev_group.append(group)
                    db.genotypes.update({'identifier': identifier},
                        {"$set": {'groups': prev_group}})

    def _add_mtDNA_JSON_file(self, infile, overwrite=False, group=None):
        """Not implemented: mtDNA JSON loading is a silent no-op."""
        pass

    def _add_file_genemark(self, infile, type_, overwrite=False, group=None):
        """Parse an STR GeneMapper export: rows grouped by 'Sample Name'.

        Each run of rows for one sample is flushed to MongoDB when the
        sample name changes, and once more after the loop for the final
        sample.
        """
        infile = infile.readlines()
        infile = [line.decode('utf-8') if type(line) != str else line\
            for line in infile]
        prev_identifier = ''
        markers = []; allele1s = []; allele2s = []
        # Column indices of the four required header fields.
        columns = [infile[0].split('\t').index(head) for head in\
            ['Sample Name', 'Marker', 'Allele 1', 'Allele 2']]
        with get_mongodb(self.dbname) as db:
            for line in infile[1:]:
                words = line.strip().split('\t')
                identifier = words[columns[0]]
                marker = utils.encode_marker(words[columns[1]], type_)
                allele1 = words[columns[2]]
                allele2 = words[columns[3]]
                if not allele2:
                    # Homozygous: a missing second allele repeats the first.
                    allele2 = allele1
                if not prev_identifier or prev_identifier == identifier:
                    markers.append(marker)
                    allele1s.append(allele1)
                    allele2s.append(allele2)
                else:
                    # Sample changed: flush the previous sample's markers.
                    marker_dict = {}
                    for i, marker in enumerate(markers):
                        alleles = allele1s[i], allele2s[i]
                        marker_dict[marker] = utils.encode_alleles(
                            alleles, type_, marker)
                    # NOTE(review): looks up 'identifier' (the NEW sample)
                    # but writes 'prev_identifier' — the overwrite check
                    # inspects the wrong document.  Confirm and fix.
                    prev_record = \
                        db.genotypes.find_one({'identifier': identifier})
                    if prev_record:
                        if overwrite or (type_ not in prev_record.keys()):
                            db.genotypes.update({'identifier': prev_identifier},
                                {'$set': {type_: marker_dict}})
                        else:
                            raise AlreadyExistException('Uploaded astr\
 example already existed')
                    else:
                        db.genotypes.insert({
                            'identifier': prev_identifier,
                            type_: marker_dict})
                    if group:
                        try:
                            prev_group = db.genotypes.find_one(
                                {'identifier': prev_identifier})['groups']
                        except:
                            prev_group = []
                        if group not in prev_group:
                            prev_group.append(group)
                        db.genotypes.update({'identifier': prev_identifier},
                            {"$set": {'groups': prev_group}})
                    # NOTE(review): 'marker' was clobbered by the flush
                    # loop above, so the new sample's first marker is
                    # recorded as the OLD sample's last marker — bug.
                    markers = [marker]
                    allele1s = [allele1]
                    allele2s = [allele2]
                prev_identifier = identifier
            # Final flush for the last sample in the file.
            marker_dict = {}
            for i, marker in enumerate(markers):
                alleles = allele1s[i], allele2s[i]
                marker_dict[marker] = utils.encode_alleles(
                    alleles, type_, marker)
            prev_record = \
                db.genotypes.find_one({'identifier': identifier})
            if prev_record:
                if overwrite or (type_ not in prev_record.keys()):
                    db.genotypes.update({'identifier': prev_identifier},
                        {'$set': {type_: marker_dict}})
            else:
                db.genotypes.insert({
                    'identifier': prev_identifier,
                    type_: marker_dict,
                })
            if group:
                try:
                    prev_group = db.genotypes.find_one(
                        {'identifier': prev_identifier})['groups']
                except:
                    prev_group = []
                if group not in prev_group:
                    prev_group.append(group)
                db.genotypes.update({'identifier': prev_identifier},
                    {"$set": {'groups': prev_group}})

    def _add_file_csv(self, infile, type_, overwrite=False, group=None):
        """Load an STR CSV (index column = identifier) row by row.

        Parse errors for a row are printed and the row is skipped.
        """
        data = pd.read_csv(infile, index_col=0)
        with get_mongodb(self.dbname) as db:
            for identifier, str_ in data.iterrows():
                try:
                    prev_record =\
                        db.genotypes.find_one({'identifier':identifier})
                    if prev_record:
                        if overwrite or (type_ not in prev_record.keys()):
                            db.genotypes.update({'identifier':identifier},
                                {'$set': {type_:
                                    utils.encode_genotype(str_.to_dict(),\
                                        type_)
                                }})
                    else:
                        db.genotypes.insert({
                            'identifier': identifier,
                            type_: utils.encode_genotype(str_.to_dict(),
                                type_)
                        })
                except:
                    # Best-effort load: report the bad row and continue.
                    print('ERROR in ', identifier,'\n', str_)
                if group:
                    try:
                        prev_group = db.genotypes.find_one(
                            {'identifier': identifier})['groups']
                    except:
                        prev_group = []
                    if group not in prev_group:
                        prev_group.append(group)
                    db.genotypes.update({'identifier': identifier},
                        {"$set": {'groups': prev_group}})

    def _add_file_json(self, infile, type_, overwrite=False, group=None):
        """Load genotypes from a JSON array of objects.

        Each object must have an 'identifier' key; the *type_* key, when
        present, maps marker names to alleles (re-encoded before insert).
        """
        genotypes_json = json.loads(infile.read())
        with get_mongodb(self.dbname) as db:
            for genotype_json in genotypes_json:
                identifier = genotype_json['identifier']
                if type_ in genotype_json:
                    str_ = genotype_json[type_]
                    for marker_name, alleles in str_.items():
                        str_[marker_name] = utils.encode_alleles(
                            alleles, type_, marker_name)
                    prev_record = \
                        db.genotypes.find_one({'identifier':identifier})
                    if prev_record:
                        if overwrite or (type_ not in prev_record.keys()):
                            db.genotypes.update({'identifier': identifier},
                                {'$set': {type_: str_}})
                        else:
                            print('{} already exist, '
                                'you can use overwrite option'.format(identifier))
                            continue
                    else:
                        db.genotypes.insert({
                            'identifier': identifier,
                            type_: str_,
                        })
                    if group:
                        try:
                            # NOTE(review): 'prev_identifier' is undefined
                            # here — NameError is swallowed by the bare
                            # except, prev_group resets to [], and the
                            # update below also targets the undefined name.
                            # Almost certainly meant 'identifier'.
                            prev_group = db.genotypes.find_one(
                                {'identifier': prev_identifier})['groups']
                        except:
                            prev_group = []
                        if group not in prev_group:
                            prev_group.append(group)
                        db.genotypes.update({'identifier': prev_identifier},
                            {"$set": {'groups': prev_group}})
[문서] def genmark_txt_to_df(self, txtfile_or_path, type_): if type(txtfile_or_path) == str: txtfile = open(txtfile_or_path, encoding='utf-8') else: txtfile = txtfile_or_path genotypes = [] column_line = next(txtfile) if type(column_line) != str: column_line = column_line.decode('utf-8') column_list = column_line.strip().split('\t') column_position = {column:position for position, column in\ enumerate(column_list)} for essential_column in ['Sample Name','Marker','Allele 1','Allele 2']: if essential_column not in column_list: raise ParsingException('There is no "{}" column in Genmapper\ file.'.format(essential_column)) index_sample_name = column_position['Sample Name'] index_marker = column_position['Marker'] index_allele_1 = column_position['Allele 1'] index_allele_2 = column_position['Allele 2'] max_index = max([index_sample_name, index_marker, index_allele_1,\ index_allele_2]) prev_identifier = '' genotype = {} line_number = 1 for line in txtfile: line_number += 1 if type(line) != str: line = line.decode('utf-8') line = line.strip() if not line or line.startswith('Sample Name'): continue words = line.split('\t') if len(words) < max_index: continue identifier = words[index_sample_name] try: marker = words[index_marker] except IndexError: raise ParsingException('There is no "marker" in Genmapper file' ' line {}.'.format(line_number)) marker = utils.encode_marker(marker, type_) if marker in config.IGNORE_MARKERS[type_]: continue try: allele1 = words[index_allele_1] except IndexError: raise ParsingException('There is no "allele1" in Genmapper file' ' line {}.'.format(line_number)) try: allele2 = words[index_allele_2] except IndexError: raise ParsingException('There is no "allele2" in Genmapper file' ' line {}.'.format(line_number)) if allele2 == '': allele2 = allele1 if prev_identifier and prev_identifier != identifier: genotype['identifier'] = prev_identifier genotypes.append(genotype) genotype = {} if marker in config.SEX_MARKERS: genotype.update({ marker: 
'{}{}'.format(allele1, allele2), }) else: genotype.update({ marker: '{}, {}'.format(allele1, allele2), }) prev_identifier = identifier if prev_identifier and genotype: genotype['identifier'] = prev_identifier genotypes.append(genotype) columns = [] for genotype in genotypes: columns += [key for key in genotype] columns = list(set(columns)) columns.remove('identifier') columns = ['identifier'] + columns df = pd.DataFrame(genotypes, columns=columns) return df
[문서] def mtdna_genmark_txt_to_df(self, txtfile_or_path): if type(txtfile_or_path) == str: txtfile = open(txtfile_or_path, encoding='utf-8') else: txtfile = txtfile_or_path headers = [] bodies = [] for line in txtfile: if type(line) != str: line = line.decode('utf-8') sp = line.strip().split('\t') if sp[0] == 'Specimen': headers = sp line = next(txtfile) if type(line) != str: line = line.decode('utf-8') bodies = line.strip().split('\t') break if headers == []: raise ParsingException('There is no Specimen in Genmapper file.') elif len(headers) != len(bodies): raise ParsingException('The numbers of header and body are\ different.') columns = ['identifier'] values = [bodies[0]] for i in range(1,len(headers)): try: trans, position, area = headers[i].split('___') except ValueError: raise ParsingException('marker name "{}" is not proper\ in Genmapper file.'.format(headers[i])) try: gene_code, remainder = bodies[i].split(' ') except ValueError: raise ParsingException('genotype "{}" is not proper in\ Genmapper file.'.format(bodies[i])) if gene_code == '-': gene_code = 'd' if trans == '-': additional_num = 1 start_index = headers.index(headers[i]) while headers.index(headers[i], start_index) != i: additional_num += 1 start_index = headers.index(headers[i], start_index+1) position = position + '.{}'.format(additional_num) columns.append(utils.encode_marker(position, 'mtDNA')) values.append(gene_code) df = pd.DataFrame(pd.Series(values)).T df.columns = columns return df