User:Decimus Schomer/Scripts/Grammar analyser

From The SchomEmunity Wiki
Jump to: navigation, search

Decimus' user page | Decimus' talk page | Decimus' scripts | Decimus' script libraries | Decimus' projects
Main scripts page | Toggling Rotate script | UUID-getter scripts | Texture changer | Channel spier | Chatbot | Jump slab | Emailer | Fractal viewer | Grammar analyser | SPD viewer


About

This script, written by Decimus, is a basic grammatical analyser - it analyses the structure of anything said to it.

If someone could add a smiley filter to it, that would be appreciated :D

The Python version

This version asks you for text on a command line. To exit the program, type '_quit' at the prompt; on exit it writes a file called analyse.dump describing the word groups and grammars it has extracted from what you typed.

# Grammar analyser - analyse the grammatical composition of sentences
# Copyright (C) 2007 Decimus Schomer
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

# (you can find version 2 of the GPL at http://www.gnu.org/licenses/old-licenses/gpl-2.0.html)

# Data storage
#  groups:   list of word lists; a word's group is its list's index
#  grammars: list of sentence patterns, each a list of group indices
groups = []
grammars = []

# Return the index of the group that holds word.
#  A word that is not yet known is put into a brand-new group of its own,
#  and the index of that new group is returned.
def get_group(word):
    for idx, members in enumerate(groups):
        if word in members:
            return idx
    # Unknown word: the new group goes on the end of the list
    groups.append([word])
    return len(groups) - 1

# Merge the groups i1 and i2 into a single group
#  The words of the higher-numbered group are moved into the lower-numbered
#  one and the higher-numbered group is deleted, so every group after it
#  shifts down by one.  All grammars are renumbered to match and any grammars
#  that become identical are collapsed into one (the first occurrence keeps
#  its place, so the grammar order stays deterministic).
#  Merging a group with itself is a no-op.
def _merge_groups(i1, i2):
    global grammars
    # Nothing to do - and carrying on would double the group and then delete it
    if i1 == i2:
        return
    # Ensure that i2 is the larger number
    if i1 > i2:
        i1, i2 = i2, i1
    # Add all the items in the second group to the items in the first
    groups[i1] += groups[i2]
    # Delete the second group
    del groups[i2]
    unique = []
    seen = set()
    for g in grammars:
        for y, n in enumerate(g):
            # Replace all references to the group that was deleted
            #  with references to the merged group
            if n == i2:
                g[y] = i1
            # Groups after the deleted one have moved down by one
            elif n > i2:
                g[y] = n - 1
        # Keep only the first copy of each (renumbered) grammar
        key = tuple(g)
        if key not in seen:
            seen.add(key)
            unique.append(g)
    grammars = unique

# Merge any groups with identical or near-identical usage
#  Every pair of same-length grammars that differ in exactly one position
#  is evidence that the two groups found at that position are used the same
#  way.  Once a pair of groups has collected at least 'threshold' pieces of
#  evidence, the two groups are merged.
#  threshold: minimum number of matching grammar pairs (5 by default)
def group_merge(threshold=5):
    # Each entry is [group in first grammar, group in second grammar, count]
    diffs = []
    # Run through each discrete pair of grammars
    for x, gx in enumerate(grammars):
        for gy in grammars[x + 1:]:
            # Only work with pairs of same-length grammars
            if len(gx) != len(gy):
                continue
            # Look for a *single* difference; anything else is discarded
            mismatches = [z for z in range(len(gx)) if gx[z] != gy[z]]
            if len(mismatches) != 1:
                continue
            idx = mismatches[0]
            # Count this difference, adding a new entry the first time
            for df in diffs:
                if (df[0] == gx[idx]) and (df[1] == gy[idx]):
                    df[2] += 1
                    break
            else:
                diffs.append([gx[idx], gy[idx], 1])
    # Run through each difference with a high enough count
    #  and merge the groups in it
    # 'done' holds (deleted group, group it was merged into) pairs
    done = []
    for g1, g2, n in diffs:
        if n < threshold:
            continue
        # Correct the entries - groups are deleted at each iteration!
        for dead, kept in done:
            if g1 == dead:
                g1 = kept
            elif g1 > dead:
                g1 -= 1
            if g2 == dead:
                g2 = kept
            elif g2 > dead:
                g2 -= 1
        # Earlier mergers may already have joined these two groups
        if g1 == g2:
            continue
        # Perform the merger
        _merge_groups(g1, g2)
        # The merger always deletes the higher-numbered group,
        #  whichever way round the pair was recorded
        done.append((max(g1, g2), min(g1, g2)))

# Main loop
#  Reads lines until '_quit' (or end of input), splits each line into
#  sentences, turns each sentence into a list of group numbers and records
#  that list as a grammar if it has not been seen before.
while 1:
    try:
        s = raw_input("Enter sentence(s): ").strip()
    except EOFError:
        # End of input: treat it like '_quit' so the dump is still written
        break
    # Take the lowercase version of the string and remove unimportant punctuation
    st = s.lower().replace(",", "")
    # Commands
    if st == "_quit":
        break
    # Split the string into sentences: '!' and '?' end a sentence just like '.'
    for sntnc in st.replace("!", ".").replace("?", ".").split("."):
        # Replace each word by the group which the word is in
        wo = [get_group(word) for word in sntnc.split()]
        # Nothing between two terminators (e.g. "..." or "?!") - not a sentence
        if not wo:
            continue
        # If the sentence has a new grammar, add the new grammar to the grammar list
        if wo not in grammars:
            grammars.append(wo)
        # Perform any possible group merging
        group_merge()

# Dump the group and grammar lists in a human-readable(-ish) form
#  Each line is the index of the group/grammar followed by its contents.
f = open("analyse.dump", "w")
try:
    f.write("Groups:\n=======\n\n")
    for i, group in enumerate(groups):
        f.write(("%2d: " % i) + ", ".join(group) + "\n")
    f.write("\nGrammars:\n=========\n\n")
    for i, grammar in enumerate(grammars):
        f.write(("%2d: " % i) + " ".join(["%2d" % n for n in grammar]) + "\n")
finally:
    # Close the file even if one of the writes fails part-way through
    f.close()

The LSL version

This version analyses any text said near it.

NOTE: THIS SHOULD WORK, BUT IT HAS NOT YET BEEN FULLY TESTED (AND, AS OF THE LATEST TEST, THE MERGER FUNCTION DOESN'T WORK)

It doesn't seem to. - Katharine Berry 20:57, 18 July 2007 (BST)
It seems that the problem is in set_add_int_list(). I'm going to have a look at what it is specifically now.
Found the problem - you forgot to copy the mda_val lists back. I've fixed it here.
Thanks. (appropriately enough, I just came to do exactly that - I also noticed that :p) --Decimus
// Grammar analyser - analyse the grammatical composition of sentences
// Copyright (C) 2007 Decimus Schomer
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

// (you can find version 2 of the GPL at http://www.gnu.org/licenses/old-licenses/gpl-2.0.html)

//////////////////////////////////////////////////////////////////////
// Multi-dimensional array library

// Working storage for the array the library functions operate on.
// The rows are stored end to end in mda_val; mda_val_lens holds the length
// of each row, so its own length is the number of rows.
// Callers copy their own list pair in here before using the library and
// copy the pair back out afterwards.
list mda_val_lens;
list mda_val;

// Return the string stored at column x of row y of the working array,
// or "" when that cell does not exist.
string mda_get_string(integer x, integer y)
{
    // A row that does not exist reads as length 0, so this also rejects bad rows
    if ((x < 0) || (y < 0) || (x >= llList2Integer(mda_val_lens, y)))
        return "";
    // Row y starts after the combined length of rows 0..y-1.
    // Row 0 must be special-cased: llList2List(l, 0, -1) is the WHOLE list,
    // which would put the start of row 0 at the end of the data.
    integer idx = 0;
    if (y > 0)
        idx = llRound(llListStatistics(LIST_STAT_SUM, llList2List(mda_val_lens, 0, y - 1)));
    return llList2String(mda_val, idx + x);
}

// Return the integer stored at column x of row y of the working array,
// or -1 when that cell does not exist.
integer mda_get_integer(integer x, integer y)
{
    // A row that does not exist reads as length 0, so this also rejects bad rows
    if ((x < 0) || (y < 0) || (x >= llList2Integer(mda_val_lens, y)))
        return -1;
    // Row y starts after the combined length of rows 0..y-1.
    // Row 0 must be special-cased: llList2List(l, 0, -1) is the WHOLE list,
    // which would put the start of row 0 at the end of the data.
    integer idx = 0;
    if (y > 0)
        idx = llRound(llListStatistics(LIST_STAT_SUM, llList2List(mda_val_lens, 0, y - 1)));
    return llList2Integer(mda_val, idx + x);
}

// Length query for the working array:
//  idx == -1   -> number of rows
//  otherwise   -> number of cells in row idx
integer mda_len(integer idx)
{
    if (idx != -1)
        return llList2Integer(mda_val_lens, idx);
    return llGetListLength(mda_val_lens);
}

// Append the string val to the end of row y of the working array.
mda_append_string(integer y, string val)
{
    integer old_len = llList2Integer(mda_val_lens, y);
    // The end of row y lies after the combined length of rows 0..y
    list rows_through_y = llList2List(mda_val_lens, 0, y);
    integer insert_at = llRound(llListStatistics(LIST_STAT_SUM, rows_through_y));
    mda_val = llListInsertList(mda_val, [val], insert_at);
    // Row y is now one cell longer
    mda_val_lens = llListReplaceList(mda_val_lens, [old_len + 1], y, y);
}

// Append the integer val to the end of row y of the working array.
mda_append_integer(integer y, integer val)
{
    integer grown = llList2Integer(mda_val_lens, y) + 1;
    // The end of row y lies after the combined length of rows 0..y
    float cells_through_y = llListStatistics(LIST_STAT_SUM, llList2List(mda_val_lens, 0, y));
    mda_val = llListInsertList(mda_val, [val], llRound(cells_through_y));
    // Record the new length of row y
    mda_val_lens = llListReplaceList(mda_val_lens, [grown], y, y);
}

// Add a new row, holding only the string s, to the end of the working array.
mda_append_row_str(string s)
{
    // The new row has exactly one cell...
    mda_val_lens += [1];
    // ...and, being the last row, its cell goes at the end of the data
    mda_val += [s];
}

//////////////////////////////////////////////////////////////////////
// Set library

// Add val as a new row of the working array, unless a row with exactly the
// same contents is already there.  Cells are compared by their string form.
set_add_list(list val)
{
    integer lenv = llGetListLength(val);
    integer rows = llGetListLength(mda_val_lens);
    integer y;
    integer x;
    integer start = 0;  // index in mda_val of the first cell of row y
    integer d;
    integer same;
    for (y = 0; y < rows; ++y)
    {
        d = llList2Integer(mda_val_lens, y);
        // Only a row of the same length can be a duplicate
        if (d == lenv)
        {
            same = TRUE;
            // Compare cell by cell in place, stopping at the first mismatch
            for (x = 0; (x < d) && same; ++x)
                if (llList2String(mda_val, start + x) != llList2String(val, x))
                    same = FALSE;
            // Already present: leave the array untouched
            if (same)
                return;
        }
        start += d;
    }
    mda_val += val;
    mda_val_lens += [lenv];
}

// Remove duplicate rows from the working array.
// The array is emptied and rebuilt row by row through set_add_list(), so
// only the first copy of each row survives, in its original order.
mda_to_set()
{
    list orig_lens = mda_val_lens;
    list orig = mda_val;
    integer len = llGetListLength(orig_lens);
    integer i;
    integer s = 0;  // index in orig of the first cell of row i
    integer d;
    list l;
    mda_val_lens = [];
    mda_val = [];
    for (i = 0; i < len; ++i)
    {
        d = llList2Integer(orig_lens, i);
        // llList2List takes an INCLUSIVE end index, so the row is s..s+d-1.
        // An empty row must not be sliced at all: with end < start the
        // call wraps around and returns cells from outside the range.
        if (d > 0)
            l = llList2List(orig, s, s + d - 1);
        else
            l = [];
        set_add_list(l);
        s += d;
    }
}

//////////////////////////////////////////////////////////////////////
// Main script

// Word groups: 'groups' holds the words of every group end to end and
// 'group_lens' holds the number of words in each group.  A group is
// identified by its position in group_lens.
list group_lens = [];
list groups = [];

// Grammars: 'grammars' holds every recorded sentence pattern end to end,
// each pattern being the group numbers of the sentence's words in order;
// 'grammar_lens' holds the number of entries in each pattern.
list grammar_lens = [];
list grammars = [];

// Return the number of the group that contains word.
// If no group contains it, a new group holding just that word is created
// (and announced with a whisper) and the new group's number is returned.
// Side effect: the working array (mda_val / mda_val_lens) is left holding
// the group data.
integer get_group(string word)
{
    mda_val_lens = group_lens;
    mda_val = groups;
    integer num_groups = llGetListLength(group_lens);
    integer y;
    integer i;
    integer start = 0;  // index in groups of the first word of group y
    integer end;        // index one past the last word of group y
    // Walk the flat word list once, keeping track of which group we are in
    for (y = 0; y < num_groups; y++)
    {
        end = start + llList2Integer(group_lens, y);
        for (i = start; i < end; i++)
            if (llList2String(groups, i) == word)
                return y;
        start = end;
    }
    // Not found: y is now the number the new group will get
    llWhisper(0, "Adding new group (" + (string)y + ") containing '" + word + "'");
    mda_append_row_str(word);
    groups = mda_val;
    group_lens = mda_val_lens;
    return y;
}

// Split s into sentences at '.', '!' and '?'.
// Commas and semicolons are dropped, as are spaces at the start of a
// sentence; sentences that end up empty are not returned.
list split_sentences(string s)
{
    list sentences = [];
    string current = "";
    integer total = llStringLength(s);
    integer pos;
    string ch;
    for (pos = 0; pos < total; ++pos)
    {
        ch = llGetSubString(s, pos, pos);
        if ((ch == ".") || (ch == "!") || (ch == "?"))
        {
            // End of a sentence: keep it only if it has any content
            if (current != "")
                sentences += [current];
            current = "";
        }
        else if (!((ch == ",") || (ch == ";") || ((ch == " ") && (current == ""))))
            current += ch;
    }
    // Text after the last terminator still counts as a sentence
    if (current != "")
        sentences += [current];
    return sentences;
}

// Split s into words at spaces.  Runs of spaces count as one separator,
// so no empty words are returned.
list split_words(string s)
{
    list words = [];
    string current = "";
    integer total = llStringLength(s);
    integer pos;
    string ch;
    for (pos = 0; pos < total; ++pos)
    {
        ch = llGetSubString(s, pos, pos);
        if (ch != " ")
            current += ch;
        else if (current != "")
        {
            // A space after some letters ends the current word
            words += [current];
            current = "";
        }
    }
    // The last word has no space after it
    if (current != "")
        words += [current];
    return words;
}

// Merge the groups g1 and g2 into a single group.
// The words of the higher-numbered group are moved to the end of the
// lower-numbered one and the higher-numbered group is removed, so every
// later group moves down by one.  All grammars are renumbered to match and
// duplicate grammars are then removed with mda_to_set().
// Merging a group with itself, or a group that does not exist, does nothing.
// Side effect: the working array (mda_val / mda_val_lens) is left holding
// the grammar data.
_merge_groups(integer g1, integer g2)
{
    integer num_groups = llGetListLength(group_lens);
    integer i;
    integer n;
    integer start1 = 0;  // index in groups of the first word of g1
    integer start2;      // index in groups of the first word of g2
    integer len1;
    integer len2;
    integer total;
    list moved;
    if (g1 == g2)
        return;
    // Ensure that g2 is the larger number
    if (g1 > g2)
    {
        integer tmp = g1;
        g1 = g2;
        g2 = tmp;
    }
    if ((g1 < 0) || (g2 >= num_groups))
        return;
    for (i = 0; i < g1; i++)
        start1 += llList2Integer(group_lens, i);
    len1 = llList2Integer(group_lens, g1);
    start2 = start1;
    for (i = g1; i < g2; i++)
        start2 += llList2Integer(group_lens, i);
    len2 = llList2Integer(group_lens, g2);
    // Move the words of g2 to just after the last word of g1.
    // An empty g2 has no words to move, and must not be sliced: with
    // end < start the list functions wrap around instead of doing nothing.
    if (len2 > 0)
    {
        moved = llList2List(groups, start2, start2 + len2 - 1);
        // Replacing a range with the empty list deletes it
        groups = llListReplaceList(groups, [], start2, start2 + len2 - 1);
        // g2 lies after g1, so the insertion point has not moved
        groups = llListInsertList(groups, moved, start1 + len1);
    }
    // g1 grows by the size of g2, and g2 disappears
    group_lens = llListReplaceList(group_lens, [len1 + len2], g1, g1);
    group_lens = llListReplaceList(group_lens, [], g2, g2);
    // Every grammar entry is a group number, so the grammars can be
    // renumbered in one pass over the flat list.  The list functions return
    // a new list - the result has to be assigned back.
    total = llGetListLength(grammars);
    for (i = 0; i < total; i++)
    {
        n = llList2Integer(grammars, i);
        if (n == g2)
            grammars = llListReplaceList(grammars, [g1], i, i);
        else if (n > g2)
            grammars = llListReplaceList(grammars, [n - 1], i, i);
    }
    // Renumbering can make grammars identical: drop the duplicates
    mda_val = grammars;
    mda_val_lens = grammar_lens;
    mda_to_set();
    grammars = mda_val;
    grammar_lens = mda_val_lens;
}

// Merge any groups with identical or near-identical usage.
// Every pair of same-length grammars that differ in exactly one position is
// evidence that the two groups found at that position are used the same
// way.  Each pair of groups with at least 5 pieces of evidence is merged.
group_merge()
{
    mda_val = grammars;
    mda_val_lens = grammar_lens;
    list diffs = [];  // triples: group in first grammar, group in second, count
    list done = [];   // pairs: deleted group, group it was merged into
    integer num_grams = llGetListLength(grammar_lens);
    integer x;
    integer y;
    integer z;
    integer k;
    integer startx = 0;  // index in grammars of the first entry of grammar x
    integer starty;      // index in grammars of the first entry of grammar y
    integer lenx;
    integer leny;
    integer ndiff;
    integer pos;
    integer a;
    integer b;
    integer found;
    integer count;
    integer dead;
    integer kept;
    // Run through each discrete pair of grammars
    for (x = 0; x < num_grams - 1; x++)
    {
        lenx = llList2Integer(grammar_lens, x);
        starty = startx + lenx;
        for (y = x + 1; y < num_grams; y++)
        {
            leny = llList2Integer(grammar_lens, y);
            // Only work with pairs of same-length grammars
            if (lenx == leny)
            {
                // Look for a *single* difference; stop counting at two
                ndiff = 0;
                pos = 0;
                for (z = 0; (z < lenx) && (ndiff < 2); z++)
                    if (llList2Integer(grammars, startx + z) != llList2Integer(grammars, starty + z))
                    {
                        ++ndiff;
                        pos = z;
                    }
                if (ndiff == 1)
                {
                    a = llList2Integer(grammars, startx + pos);
                    b = llList2Integer(grammars, starty + pos);
                    // Count this difference, adding a new entry the first time
                    found = FALSE;
                    count = llGetListLength(diffs) / 3;
                    for (k = 0; (k < count) && (!found); k++)
                        if ((llList2Integer(diffs, 3*k) == a) &&
                            (llList2Integer(diffs, (3*k)+1) == b))
                        {
                            // The list functions return a new list - assign it back
                            diffs = llListReplaceList(diffs, [llList2Integer(diffs, (3*k)+2) + 1], (3*k)+2, (3*k)+2);
                            found = TRUE;
                        }
                    if (!found)
                        diffs += [a, b, 1];
                }
            }
            starty += leny;
        }
        // Grammar x+1 starts straight after grammar x
        startx += lenx;
    }
    // Run through each difference with a count of at least 5
    //  and merge the groups in it
    count = llGetListLength(diffs) / 3;
    for (k = 0; k < count; k++)
    {
        if (llList2Integer(diffs, (3*k)+2) >= 5)
        {
            a = llList2Integer(diffs, 3*k);
            b = llList2Integer(diffs, (3*k)+1);
            // Correct the entries - groups are deleted at each iteration!
            for (x = 0; x < (llGetListLength(done) / 2); x++)
            {
                dead = llList2Integer(done, 2*x);
                kept = llList2Integer(done, (2*x)+1);
                if (a == dead)
                    a = kept;
                else if (a > dead)
                    a -= 1;
                if (b == dead)
                    b = kept;
                else if (b > dead)
                    b -= 1;
            }
            // Earlier mergers may already have joined these two groups
            if (a != b)
            {
                _merge_groups(a, b);
                // The merger always deletes the higher-numbered group
                if (a > b)
                    done += [a, b];
                else
                    done += [b, a];
            }
        }
    }
}

// The only state.
//  Channel 2: anything said is analysed - split into sentences, each sentence
//             turned into a list of group numbers and stored as a grammar
//             if it is not already known.
//  Channel 1: "LIST" says all groups and grammars on channel 0,
//             "MERGE" runs group_merge().
default
{
    state_entry()
    {
        llListen(2, "", "", "");
        llListen(1, "", "", "LIST");
        llListen(1, "", "", "MERGE");
    }

    listen(integer channel, string name, key id, string msg)
    {
        if (channel != 1)
        {
            // Text to analyse
            list sentences = split_sentences(llToLower(msg));
            integer num_sentences = llGetListLength(sentences);
            list words;
            integer num_words;
            list pattern;
            integer si;
            integer wi;
            for (si = 0; si < num_sentences; ++si)
            {
                // Replace every word of the sentence by its group number
                pattern = [];
                words = split_words(llList2String(sentences, si));
                num_words = llGetListLength(words);
                for (wi = 0; wi < num_words; ++wi)
                    pattern += [get_group(llList2String(words, wi))];
                // Store the pattern unless an identical grammar exists.
                // get_group() uses the working array too, so the grammar
                // data is only loaded once all the words have been looked up.
                mda_val_lens = grammar_lens;
                mda_val = grammars;
                set_add_list(pattern);
                grammar_lens = mda_val_lens;
                grammars = mda_val;
            }
            return;
        }
        // Commands
        if (msg == "MERGE")
        {
            group_merge();
            return;
        }
        if (msg != "LIST")
            return;
        integer row;
        integer count;
        integer first = 0;  // index of the first cell of the current row
        integer past = 0;   // index one past the last cell of the current row
        mda_val = groups;
        mda_val_lens = group_lens;
        llSay(0, "Groups:");
        count = mda_len(-1);
        for (row = 0; row < count; ++row)
        {
            past += llList2Integer(mda_val_lens, row);
            llSay(0, "(" + (string)row + ") " + llList2CSV(llList2List(mda_val, first, past-1)));
            first = past;
        }
        mda_val = grammars;
        mda_val_lens = grammar_lens;
        first = 0;
        past = 0;
        llSay(0, "Grammars:");
        count = mda_len(-1);
        for (row = 0; row < count; ++row)
        {
            past += llList2Integer(mda_val_lens, row);
            llSay(0, "(" + (string)row + ") " + llList2CSV(llList2List(mda_val, first, past-1)));
            first = past;
        }
    }
}