# Name of this module as seen by the import system. Presumably kept as the
# anchor for package-resource lookups; nothing in the visible part of this
# file reads it -- TODO confirm it is still used elsewhere.
resource_package = __name__

import string
import re
import collections
import math
import pandas as pd
import json
import xml.dom.minidom as minidom
import xml.etree.ElementTree as et
from xml.etree.ElementTree import ElementTree
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count
try:
    from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
    try:
        from itertools import izip as zip # < 2.5 or 3.x
    except ImportError:
        pass

##############Show Dataframe########################
def show_dataframe(parse_data):
    """Map every song title in a parsed dataset to its lyrics.

    Args:
        parse_data: Parsed XML tree exposing ``getroot()``. Each child of
            the root is treated as one document holding ``SONG`` and
            ``LYRICS`` child elements.

    Returns:
        dict: ``{song title: lyrics text}`` in document order. When two
        documents share a title, the first keeps its position and the last
        one's lyrics win. A missing ``SONG`` or ``LYRICS`` element
        contributes ``None`` instead of raising.
    """
    def child_text(node, tag):
        # The guard has to test the looked-up child: ``find`` returns None
        # when the tag is absent, whereas the iterated node is never None.
        child = node.find(tag)
        return child.text if child is not None else None

    res = {}
    for node in parse_data.getroot():
        res[child_text(node, "SONG")] = child_text(node, "LYRICS")

    return res

##############N_DOC########################
def data_var(tree):
    """Load the song dataset into ``tree`` and return its raw columns.

    Args:
        tree: Object exposing ``parse(path)`` and ``iter(tag)`` (for example
            an ``xml.etree.ElementTree.ElementTree``). It is populated from
            the dataset file as a side effect of this call.

    Returns:
        tuple: ``(all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc)``
        where the first three are lists of the text of every ``DOCNO``,
        ``SONG`` and ``LYRICS`` element, ``N_DOC`` is the number of
        ``LYRICS`` elements, and ``all_sentence_doc[i]`` is the title and
        lyrics of document ``i`` joined by a single space.

    Raises:
        IndexError: If the file holds fewer ``SONG`` than ``LYRICS`` elements.
    """
    tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    all_doc_no = [node.text for node in tree.iter("DOCNO")]
    all_song = [node.text for node in tree.iter("SONG")]
    all_lyrics = [node.text for node in tree.iter("LYRICS")]

    N_DOC = len(all_lyrics)

    # Join with a space (as proximity() and phrase() do) so the last word of
    # the title is not fused with the first word of the lyrics. Empty
    # elements have text None and are treated as an empty string.
    all_sentence_doc = [
        (all_song[i] or "") + " " + (all_lyrics[i] or "")
        for i in range(N_DOC)
    ]

    return all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc
    
    
##############Remove Punctuation###################
# Lines that start with an http(s) URL, including their trailing newline(s).
_URL_LINE_RE = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
# Any single character from string.punctuation.
_PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + ']')
# Same pattern as CountVectorizer's default ``token_pattern``: runs of two or
# more word characters. Using it directly avoids building a vectorizer object
# on every call just to obtain its tokenizer.
_WORD_RE = re.compile(r'(?u)\b\w\w+\b')


def remove_punc_tokenize(sentence):
    """Split ``sentence`` into word tokens, ignoring punctuation and URL lines.

    Args:
        sentence: Text to tokenize.

    Returns:
        list: Tokens of at least two word characters, in order of
        appearance. Case is left untouched.
    """
    # URL lines must be removed while "://" is still intact; once the
    # punctuation has been blanked out the URL pattern can never match.
    sentence = _URL_LINE_RE.sub('', sentence)
    sentence = _PUNCTUATION_RE.sub(' ', sentence)
    return _WORD_RE.findall(sentence)


##############Case Folding########################
def to_lower(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered
    
##############Load Data########################
def load_data(dcmnt_xml):
    """Extract document numbers and lyrics text from a DOM document.

    Args:
        dcmnt_xml: DOM document exposing ``getElementsByTagName`` (for
            example the result of ``xml.dom.minidom.parse``).

    Returns:
        tuple: ``(all_doc_no, N_DOC, all_sentence_doc_sample)`` where
        ``all_doc_no`` is the node list of ``DOCNO`` elements, ``N_DOC`` is
        its length, and ``all_sentence_doc_sample[i]`` is the data of the
        first child of the i-th ``LYRICS`` element prefixed with one space.

    Raises:
        IndexError: If there are fewer ``LYRICS`` than ``DOCNO`` elements.
    """
    all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
    all_text = dcmnt_xml.getElementsByTagName('LYRICS')

    N_DOC = len(all_doc_no)

    all_sentence_doc_sample = []
    for i in range(N_DOC):
        # An empty <LYRICS/> element has no child node at all.
        first_child = all_text[i].firstChild
        lyrics = first_child.data if first_child is not None else ''
        all_sentence_doc_sample.append(' ' + lyrics)

    return all_doc_no, N_DOC, all_sentence_doc_sample


##############Indexing########################
def indexing(N_DOC, tokens_doc, all_doc_no):
    """Build a positional inverted index over tokenized documents.

    Args:
        N_DOC: Number of leading documents to index.
        tokens_doc: Sequence whose n-th item is the token list of document n.
        all_doc_no: Sequence of DOM nodes; the data of the first child of
            the n-th node is used as the identifier of document n.

    Returns:
        dict: ``{token: {doc_no: [positions]}}`` with tokens in sorted order,
        documents in input order, and 1-based positions in ascending order.
        Only tokens that occur in at least one document are listed.
    """
    index = {}
    for n in range(N_DOC):
        # One pass per document collects the positions of all its tokens,
        # instead of rescanning the document once per vocabulary term.
        positions_by_token = {}
        for position, token in enumerate(tokens_doc[n], 1):
            positions_by_token.setdefault(token, []).append(position)

        if not positions_by_token:
            # The identifier of a document without tokens is never needed.
            continue

        doc_no = all_doc_no[n].firstChild.data
        for token, positions in positions_by_token.items():
            index.setdefault(token, {})[doc_no] = positions

    return dict(sorted(index.items()))



from nltk.corpus import stopwords
# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
def stop_word_token(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept
    
 
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance used by stemming().
stemmer = PorterStemmer()
def stemming(tokens):
    """Replace every token by its Porter stem, in place.

    Args:
        tokens: Mutable sequence of word strings; it is modified in place.

    Returns:
        The same ``tokens`` object that was passed in, after stemming.
    """
    for i, token in enumerate(tokens):
        # Stem once and reuse the result instead of stemming a second time
        # for the assignment.
        stemmed = stemmer.stem(token)
        if stemmed != token:
            tokens[i] = stemmed
    return tokens
    

def proximity(dcmnt_xml, query):
    """Find the songs whose title or lyrics contain the term ``query``.

    Each document is the ``SONG`` text and ``LYRICS`` text joined by a space,
    tokenized with ``remove_punc_tokenize`` and lower-cased; tokens that
    contain a digit are not searchable. ``query`` is compared to the tokens
    as given, so it should already be lower-case.

    Args:
        dcmnt_xml: DOM document exposing ``getElementsByTagName``.
        query: Single term to look up.

    Returns:
        dict: ``{int(DOCNO): song title}`` for every matching document, in
        document order. Titles are read from the dataset file by position
        (``DOCNO - 1``).

    Raises:
        KeyError: If no document contains ``query``, or if a matching
            document number has no corresponding row in the dataset file.
    """
    all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
    all_song = dcmnt_xml.getElementsByTagName('SONG')
    all_lyrics = dcmnt_xml.getElementsByTagName('LYRICS')

    # Tokens containing digits are dropped before matching, so a query that
    # contains one can never be found.
    searchable = not any(ch.isdigit() for ch in query)

    matching_doc_nos = []
    for n in range(len(all_doc_no)):
        sentence_doc = all_song[n].firstChild.data + ' ' + all_lyrics[n].firstChild.data
        tokens = to_lower(remove_punc_tokenize(sentence_doc))
        if searchable and query in tokens:
            matching_doc_nos.append(int(all_doc_no[n].firstChild.data))

    if not matching_doc_nos:
        raise KeyError(query)

    # Parse the dataset once, after the search, rather than once per
    # matching document.
    titles = []
    for node in et.parse("InvertedIndexSimulator/data/dataset_STBI.xml").getroot():
        song = node.find("SONG")
        titles.append(song.text if song is not None else None)

    res = {}
    for doc_no in matching_doc_nos:
        # Reject out-of-range numbers explicitly; a plain list index would
        # silently wrap around for DOCNO 0.
        if not 1 <= doc_no <= len(titles):
            raise KeyError(doc_no)
        res[doc_no] = titles[doc_no - 1]

    return res

def detail(id):
    """Return the lyrics and title of the ``id``-th song in the dataset file.

    Args:
        id: 1-based position of the song among the children of the XML root.

    Returns:
        tuple: ``(lyrics, title)``. Either item is ``None`` when the
        corresponding ``LYRICS`` / ``SONG`` element is missing.

    Raises:
        KeyError: If ``id`` is smaller than 1 or larger than the number of
            songs in the file.
    """
    xroot = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml").getroot()
    songs = list(xroot)

    # Reject out-of-range ids explicitly; a plain list index would silently
    # wrap around for id 0.
    if not 1 <= id <= len(songs):
        raise KeyError(id)

    def child_text(node, tag):
        # ``find`` returns None when the tag is absent.
        child = node.find(tag)
        return child.text if child is not None else None

    node = songs[id - 1]
    lyrics = child_text(node, "LYRICS")
    judul = child_text(node, "SONG")

    return lyrics, judul

def phrase(dcmnt_xml, query):
    """Find the songs that contain the two-word phrase ``query``.

    Each document is the ``SONG`` text and ``LYRICS`` text joined by a space,
    tokenized with ``remove_punc_tokenize`` and lower-cased, with tokens that
    contain a digit removed. Every pair of adjacent remaining tokens forms a
    bigram written as ``first_second``.

    Args:
        dcmnt_xml: DOM document exposing ``getElementsByTagName``.
        query: Phrase in bigram form, i.e. two lower-case words joined by an
            underscore (``"word1_word2"``).

    Returns:
        dict: ``{int(DOCNO): song title}`` for every matching document, in
        document order. Titles are read from the dataset file by position
        (``DOCNO - 1``).

    Raises:
        KeyError: If no document contains the bigram, or if a matching
            document number has no corresponding row in the dataset file.
    """
    all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
    all_song = dcmnt_xml.getElementsByTagName('SONG')
    all_lyrics = dcmnt_xml.getElementsByTagName('LYRICS')

    matching_doc_nos = []
    for n in range(len(all_doc_no)):
        sentence_doc = all_song[n].firstChild.data + ' ' + all_lyrics[n].firstChild.data
        tokens = to_lower(remove_punc_tokenize(sentence_doc))
        tokens = [w for w in tokens if not any(ch.isdigit() for ch in w)]

        # Compare whole bigrams. A substring test against the joined bigram
        # string would also accept partial words, e.g. "he_is" in "she_is".
        bigrams = set(first + '_' + second for first, second in zip(tokens, tokens[1:]))
        if query in bigrams:
            matching_doc_nos.append(int(all_doc_no[n].firstChild.data))

    if not matching_doc_nos:
        raise KeyError(query)

    # Parse the dataset once, after the search, rather than once per
    # matching document.
    titles = []
    for node in et.parse("InvertedIndexSimulator/data/dataset_STBI.xml").getroot():
        song = node.find("SONG")
        titles.append(song.text if song is not None else None)

    res = {}
    for doc_no in matching_doc_nos:
        # Reject out-of-range numbers explicitly; a plain list index would
        # silently wrap around for DOCNO 0.
        if not 1 <= doc_no <= len(titles):
            raise KeyError(doc_no)
        res[doc_no] = titles[doc_no - 1]

    return res