model.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import os
import re
from typing import Dict, List, Set


def check_format(json_file, check_for=dict):
    """Return the parsed JSON data represented by *json_file*.

    Args:
        json_file: One of

            * an instance of ``check_for`` (returned unchanged),
            * a string naming an existing path, which is read as JSON,
            * any other string, which is parsed as a JSON document.
        check_for: Type that marks the input as already parsed.

    Returns:
        ``json_file`` itself if it is an instance of ``check_for``,
        otherwise the decoded JSON value.  The decoded value is not
        checked against ``check_for``.

    Raises:
        TypeError: If ``json_file`` is neither an instance of
            ``check_for`` nor a string.
        json.JSONDecodeError: If the file content or the string is not
            valid JSON.
    """
    if isinstance(json_file, check_for):
        return json_file

    if not isinstance(json_file, str):
        raise TypeError('Input not convertible.')

    if os.path.exists(json_file):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, so non-ASCII text loads the same everywhere.
        with open(json_file, 'r', encoding='utf-8') as jf:
            return json.load(jf)

    # Not a path on disk: treat the string itself as the JSON document.
    return json.loads(json_file)


def from_json(json_file):
    """Load a list of verses from a JSON source.

    Args:
        json_file: Either an object with a ``read`` method (e.g. an open
            file) or a string naming an existing path.  The JSON content
            must be an iterable of verse entries accepted by
            ``Verse.from_json``.

    Returns:
        A list with one ``Verse`` per entry of the JSON content.

    Raises:
        TypeError: If ``json_file`` is neither file-like nor the path of
            an existing file.
    """
    if hasattr(json_file, 'read'):
        verses = json.load(json_file)

    elif isinstance(json_file, str) and os.path.exists(json_file):
        # The context manager closes the handle; UTF-8 is given explicitly
        # so decoding does not depend on the platform's locale.
        with open(json_file, encoding='utf-8') as jf:
            verses = json.load(jf)

    else:
        # The exception has to be raised explicitly; otherwise execution
        # would continue with ``verses`` unbound.
        raise TypeError('Input not convertible.')

    return [Verse.from_json(verse) for verse in verses]


def minimal(full_dict: dict):
    """Return a copy of *full_dict* without ``None`` and ``{}`` values.

    Nested dictionaries are pruned recursively.  Pruning is a single
    pass: a nested dictionary that only becomes empty *because* of the
    pruning is kept as ``{}``.  Other falsy values (``0``, ``''``, ``[]``)
    are kept.

    Args:
        full_dict: Dictionary to prune; it is not modified.

    Returns:
        A new dictionary containing the remaining entries.
    """
    result_dict = {}
    for key, value in full_dict.items():
        if value is None or value == {}:
            continue
        result_dict[key] = minimal(value) if isinstance(value, dict) else value
    return result_dict

class Syllable:
    """A syllable together with its span, lengths and phenomena.

    Attributes:
        text: The syllable string.
        span: Two-element sequence ``[start, end]``; ``end - start`` must
            equal ``len(text)``.
        id: Identifier passed in as ``idx``.
        syllable_length: Value stored as given.
        vowel_length: Value stored as given.
        phenomena: Dictionary of phenomena; ``from_json`` fills it with
            ``Phenomenon`` objects keyed by name.
    """

    def __init__(self, syllable: str, span: List[int], idx: int,
                 syllable_length: int, vowel_length: int,
                 phenomena: dict = None):
        """Create a syllable.

        Raises:
            ValueError: If ``len(syllable)`` differs from
                ``span[1] - span[0]``.
        """
        if len(syllable) != span[1] - span[0]:
            raise ValueError('Syllable length does not match syllable span.')

        self.text = syllable
        self.span = span
        self.id = idx
        self.syllable_length = syllable_length
        self.vowel_length = vowel_length
        # A fresh dict per instance: a ``dict()`` default in the signature
        # would be created once and shared by all syllables.
        self.phenomena = {} if phenomena is None else phenomena

    @classmethod
    def from_json(cls, json_file):
        """Build a syllable from a dict, a JSON string or a JSON file path.

        The keys ``id``, ``span``, ``syllable``, ``syllable_length`` and
        ``vowel_length`` are required; ``phenomena`` is optional.
        """
        raw = check_format(json_file)

        idx = raw['id']
        span = raw['span']
        text = raw['syllable']
        syllable_length = raw['syllable_length']
        vowel_length = raw['vowel_length']
        syllable = cls(text, span, idx, syllable_length, vowel_length)

        if 'phenomena' in raw:
            syllable.phenomena = {
                name: Phenomenon.from_json(data)
                for name, data in raw['phenomena'].items()
            }

        return syllable

    def to_dict(self):
        """Return the syllable as a dictionary pruned by ``minimal``."""
        features = {
            'id': self.id,
            'span': self.span,
            'syllable': self.text,
            'syllable_length': self.syllable_length,
            'vowel_length': self.vowel_length,
            'phenomena': minimal({key: value.to_dict()
                                  for key, value in self.phenomena.items()}),
        }
        return minimal(features)

    def to_json(self):
        """Return ``to_dict()`` serialised as a JSON string."""
        return json.dumps(self.to_dict())

class Phenomenon:
    """A phenomenon described by up to five optional attributes.

    All attributes default to ``None`` and are stored exactly as given:
    ``caused_by``, ``overruled_by``, ``chars``, ``typus`` and ``omitted``.
    """

    # NOTE(review): earlier commented-out factory sketches paired ``chars``
    # with positional lengthening, ``typus`` with iambic shortening and
    # ``omitted='s'`` with s-elision; a "verse end" phenomenon set none of
    # them.

    # Optional attributes, in the order in which they are serialised.
    _FIELDS = ('caused_by', 'overruled_by', 'chars', 'typus', 'omitted')

    def __init__(self, caused_by=None, overruled_by=None,
                 chars=None, typus=None, omitted=None):
        self.caused_by = caused_by
        self.overruled_by = overruled_by

        self.chars = chars
        self.typus = typus
        self.omitted = omitted

    @classmethod
    def from_json(cls, json_file):
        """Build a phenomenon from a dict, a JSON string or a JSON file path.

        Every key is optional; attributes whose key is absent stay ``None``.
        """
        raw = check_format(json_file)

        phenomenon = cls()
        for name in cls._FIELDS:
            if name in raw:
                setattr(phenomenon, name, raw[name])

        return phenomenon

    def to_dict(self):
        """Return the attributes that are not ``None``, pruned by ``minimal``."""
        features = {}
        for name in self._FIELDS:
            value = getattr(self, name)
            if value is not None:
                features[name] = value

        return minimal(features)

    def to_json(self):
        """Return ``to_dict()`` serialised as a JSON string."""
        return json.dumps(self.to_dict())

class MultisyllablePhenomenon(Phenomenon):
    """A phenomenon that additionally records a ``beginning`` and an ``end``.

    NOTE(review): earlier commented-out sketches named apheresis and
    synizesis as phenomena of this kind.
    """

    # Optional attributes, in the order in which they are serialised
    # after ``beginning`` and ``end``.
    _OPTIONAL_FIELDS = ('caused_by', 'overruled_by', 'chars', 'typus',
                        'omitted')

    def __init__(self, beginning:int, end:int, caused_by=None, 
                 overruled_by=None, chars=None, typus=None, omitted=None):
        super().__init__(caused_by, overruled_by, chars, typus, omitted)
        self.beginning = beginning
        self.end = end

    @classmethod
    def from_json(cls, json_file):
        """Build an instance from a dict, a JSON string or a JSON file path.

        The keys ``beginning`` and ``end`` are required; the remaining
        attributes are optional and stay ``None`` when their key is absent.
        """
        raw = check_format(json_file)

        beginning = raw['beginning']
        end = raw['end']

        phenomenon = cls(beginning, end)
        for name in cls._OPTIONAL_FIELDS:
            if name in raw:
                setattr(phenomenon, name, raw[name])

        return phenomenon

    def to_dict(self):
        """Return ``beginning``/``end`` plus all set optional attributes.

        The result is pruned by ``minimal``, which also drops ``beginning``
        or ``end`` if they are ``None``.
        """
        features = {'beginning': self.beginning, 'end': self.end}
        for name in self._OPTIONAL_FIELDS:
            value = getattr(self, name)
            if value is not None:
                features[name] = value

        return minimal(features)

    def to_json(self):
        """Return ``to_dict()`` serialised as a JSON string."""
        return json.dumps(self.to_dict())


class Token:
    """A token with its span and, optionally, its syllables.

    Attributes:
        text: The token string.
        span: Two-element sequence ``[start, end]``; ``end - start`` must
            equal ``len(text)``.
        syllables: List of ``Syllable`` objects (empty by default).
        clitic: Optional clitic information, stored as given.
        accented: Optional alternative form of the token, stored as given.
        lemma_to_morphtags: Optional mapping, stored as given.

    ``accented`` and ``lemma_to_morphtags`` are neither read by
    ``from_json`` nor written by ``to_dict``.
    """

    # Raw string: ``\W`` must reach the regex engine as an escape instead
    # of being an invalid string-literal escape sequence.
    _PUNCT_PATTERN = re.compile(r'^[\W_]+$')

    def __init__(self, token: str, span: List[int],
                 syllables: List['Syllable'] = None, clitic: str = None,
                 accented: str = None,
                 lemma_to_morphtags: Dict[str, Set[str]] = None):
        """Create a token.

        Raises:
            ValueError: If ``len(token)`` differs from ``span[1] - span[0]``.
        """
        if len(token) != span[1]-span[0]:
            raise ValueError('Length of token {} does not match span {}.'
                             .format(token, span))

        self.text = token
        self.span = span
        self.syllables = syllables or list()
        self.clitic = clitic

        self.accented = accented
        self.lemma_to_morphtags = lemma_to_morphtags

    @classmethod
    def from_json(cls, json_file):
        """Build a token from a dict, a JSON string or a JSON file path.

        The keys ``token`` and ``span`` are required; ``clitic`` and
        ``syllables`` are optional.
        """
        raw = check_format(json_file)

        token = cls(raw['token'], raw['span'])

        if 'clitic' in raw:
            token.clitic = raw['clitic']

        if 'syllables' in raw:
            for syllable in raw['syllables']:
                token.syllables.append(Syllable.from_json(syllable))

        return token

    def to_dict(self):
        """Return the token as a dictionary pruned by ``minimal``.

        ``syllables`` is only included when the token has any.
        """
        features = {
            'token': self.text,
            'span': self.span,
            'clitic': self.clitic,
        }

        if self.syllables:
            features['syllables'] = [syllable.to_dict()
                                     for syllable in self.syllables]

        return minimal(features)

    def to_json(self):
        """Return ``to_dict()`` serialised as a JSON string."""
        return json.dumps(self.to_dict())

    def is_punct(self):
        """Return True if the text consists only of non-word characters
        and/or underscores (and is not empty)."""
        return bool(self._PUNCT_PATTERN.match(self.text))

    def __str__(self):
        return self.text

    def __repr__(self):
        return ('Token(token={}, span={}, syllables={})'
                .format(self.text, self.span, self.syllables))


class Reading:
    """One reading: a sequence of tokens plus reading-level phenomena.

    Attributes:
        tokens: List of ``Token`` objects.
        phenomena: Dictionary; ``from_json`` fills it with lists of
            ``MultisyllablePhenomenon`` objects keyed by name.
    """

    def __init__(self, tokens: List[Token] = None, phenomena: dict = None):
        # Falsy arguments (``None`` or empty) are replaced by new containers.
        self.tokens = tokens if tokens else list()
        self.phenomena = phenomena if phenomena else dict()

    @classmethod
    def from_json(cls, json_file):
        """Build a reading from a dict, a JSON string or a JSON file path.

        The key ``tokens`` is required; ``phenomena`` is optional and maps
        each name to an iterable of phenomenon entries.
        """
        raw = check_format(json_file)

        reading = cls([Token.from_json(entry) for entry in raw["tokens"]])

        if 'phenomena' in raw:
            for name, entries in raw['phenomena'].items():
                for entry in entries:
                    parsed = MultisyllablePhenomenon.from_json(entry)
                    # A name only appears once it has at least one entry.
                    reading.phenomena.setdefault(name, []).append(parsed)

        return reading

    def to_dict(self):
        """Return the reading as a dictionary pruned by ``minimal``."""
        token_dicts = [token.to_dict() for token in self.tokens]

        phenomena_dicts = dict()
        for name, entries in self.phenomena.items():
            phenomena_dicts[name] = [minimal(entry.to_dict())
                                     for entry in entries]

        return minimal({'tokens': token_dicts, 'phenomena': phenomena_dicts})

    def to_json(self):
        """Return ``to_dict()`` serialised as a JSON string."""
        return json.dumps(self.to_dict())

    def __len__(self):
        """Return the number of tokens."""
        return len(self.tokens)

    def append_token(self, token: Token):
        """Add *token* at the end of the reading."""
        self.tokens.append(token)

    def __str__(self):
        # Use the accented form of a token where one is set.
        return ' '.join(
            t.text if t.accented is None else t.accented
            for t in self.tokens
        )

    def __repr__(self):
        # TODO: Implement this properly.
        return str(self)


class Verse:
    """A verse with optional source information and its readings.

    Attributes:
        text: The verse string.
        source: ``None`` or a dictionary; ``from_json`` builds it with the
            keys ``author``, ``work`` and ``place``.
        readings: List of ``Reading`` objects.
    """

    # Keys copied from the ``source`` entry by ``from_json``.
    _SOURCE_KEYS = ('author', 'work', 'place')

    def __init__(self, verse: str, source: dict = None,
                 readings: List[Reading] = None):
        self.text = verse
        self.source = source
        self.readings = readings or list()

    @classmethod
    def from_plain_verse(cls, plain_verse):
        """Create a verse from its plain text, without any readings."""
        verse = cls(plain_verse)
        # TODO: Generate readings.
        return verse

    @classmethod
    def from_json(cls, json_file):
        """Build a verse from a dict, a JSON string or a JSON file path.

        The keys ``verse`` and ``readings`` are required.  ``source`` is
        optional, as are its individual keys: ``to_dict`` omits whatever
        is ``None``, so requiring them here would make the output of
        ``to_json`` unreadable for verses without complete source data.
        Missing source keys are set to ``None``.
        """
        raw = check_format(json_file)

        raw_source = raw.get('source')
        if raw_source is None:
            source = None
        else:
            source = {key: raw_source.get(key) for key in cls._SOURCE_KEYS}

        verse = cls(raw['verse'], source=source)

        for reading in raw['readings']:
            verse.readings.append(Reading.from_json(reading))

        return verse

    def to_dict(self):
        """Return the verse as a dictionary pruned by ``minimal``."""
        features = {
            'verse': self.text,
            'source': self.source,
            'readings': [reading.to_dict() for reading in self.readings],
        }

        return minimal(features)

    def to_json(self):
        """Return ``to_dict()`` serialised as a JSON string."""
        return json.dumps(self.to_dict())

    def __str__(self):
        s = 'Verse: {verse}\n{reading_num} Readings:\n{readings}'
        readings_str = '\n'.join(str(r) for r in self.readings)
        return s.format(verse=self.text, reading_num=len(self.readings),
                        readings=readings_str)