25 Years of Programming
An open source source for C, C++, OWL, BASIC, MDB, XLS, DOT, and more...
Home   Projects   Up   Sitemap   Search   Blog   Forum+Chat   About Us   Privacy   Terms of Use   Feedback   FAQ   Images   Services   Ads   Donate   Humor

Fact.cpp and related classes for natural language processing

This module of the WTalk.cpp natural language processing (NLP) chatterbot project parses the user's input and stores the extracted information.

A FACT contains an array of tokens extracted from a user sentence, grammatically parsed with words and phrases marked by grammatical type and by the type of information they contain (who, what, when, where, why).

fact.h

/*	fact.h	        	9-18-01
	This is part of the WTalk project.
	Copyright (C)1993-2001 Steven Whitney.
	Published under GNU GPL (General Public License) Version 2, with ABSOLUTELY NO WARRANTY.
	Initially published by http://25yearsofprogramming.com.

*/
#ifndef __FACT_H
#define __FACT_H

#include <classlib\arrays.h>
#include "c:\bcs\my.h"
#pragma hdrstop

#include "dic.h"
#include "c:\bcs\library\wordlist.h"
#include "c:\bcs\library\textsub.h"

/////////////////////////////////////////////////////////////////////////////
// A FACT contains an array of tokens extracted from a user sentence,
// grammatically parsed with phrases marked by grammatical type and
// by the type of information they contain (who,what,when,where,why,etc).
// It should be expanded as necessary to make manipulation of tokens in a FACT
// as easy as the manipulation of chars within a string.
class FACT
{
public:
	FACT(string s = "", BOOL parsenow = TRUE);
	FACT(const FACT& other, int startpos = 0, int count = MAXINT);
	FACT(const FACT& other, TOK::infotypes itype);
	~FACT();

	// #error write operator =

	// comparison functions
	BOOL operator == (const FACT& other) const;	// uses (&other == this)
	BOOL SameAs(const FACT& other) const;		// alternative ==, compares members
	double Similarity(const FACT& other) const; // measure of approximate match

	// input/output functions
	friend istream& operator >> (istream& is, FACT&);		  	// read
	friend ostream& operator << (ostream& os, const FACT&);  	// write
	string ShowToks() const;

	// array functions
	uint GetItemsInContainer() const { return(toks.GetItemsInContainer()); }
	BOOL IsIndex(uint pos) const { return(pos < toks.GetItemsInContainer()); }

	// editing and manipulation functions:
	BOOL Contains(const FACT& other, BOOL insequence, BOOL inorder) const;
	int Count(const string& word) const;			// how many times is this word in toks[]?
	int Destroy(int loc);							// destroy token at loc
	int Destroy(const string& s, int startpos = 0);	// destroy 1st s at or after startpos
	int Destroy(const FACT& other);					// destroy matching toks
	int Find(const string& word,int startpos = 0) const;// index of word, at or after pos,
														// or INT_MAX if not found
	int Find(const FACT& other,int startpos = 0) const;	// wholly-contained, at or after pos,
														// or INT_MAX if not found
	void Insert(const FACT& other, int loc = 0);		// default is to prepend
	void operator += (const FACT& other);
	void Remove(int cutpoint);

	// other functions
	BOOL IsGood() const;
	int Phrase(int startpos, int ptype, BOOL fitonly, BOOL markwt, BOOL markpt);
	int AnyPhrase(int startpos, BOOL markwt = FALSE, BOOL markpt = FALSE);
	int MarkPhrase(int pos, int count, TOK::phrasetypes ptype, BOOL overwrite = TRUE);
	int MarkInfotypes(int pos, int count, int itype, BOOL overwrite = TRUE);
	int DDEParse();							// Parse using DDE
	string ReversedContext();
	int Tokenize();
	const string& RebuildSentence();		// recreate Sentence from toks
	BOOL UploadToTOKSTable();				// send toks to MDB table

	// variables
	TArrayAsVector<TOK> toks;	// sentence tokens.  Do NOT derive FACT from TArray<TOK>!
	string Sentence;			// sentence in text form.
	int flag;					// can use as BOOL (flagged) or as int (for ranking)
	enum sentypes		// types of sentences
	{					// DO NOT REORDER. THERE ARE RULES THAT REFER TO THESE BY NUMBER.
		UNKNOWN,        // 0
		STATEMENT,      // 1. normal input
		QUESTION,		// 2. general question for pgm to answer (parse might change the
						//    type to one of the more specific Q types below).
		ANSWERYES,		// 3. a YES answer, in effect
		ANSWERNO,		// 4. a NO answer, in effect
		ANSWERIND,		// 5. indeterminate (you only know it was intended as an answer)
		DESCRIPTION,	// 6. sentence uses is-word(s), maybe for updating word definition
		COMMAND,		// 7. command to pgm: now not made into FACTs, but might in the future.
		EXCLAMATION,	// 8. use as emotion indicator
		PARTIAL			// 9. if terminated by a comma or otherwise incomplete (a phrase).
	} sentype;			// type of sentence: statement, question, etc.

							// For a question, determining what is being asked about is
							// beyond the ability of a Rule, but it's not too difficult,
							// and the natural place to do it is in parse().
							// The question-answering routine (now answerq) can use qtype
							// to determine what kind of query it is.
							// Most of these types are based on the infotypes.
	enum qtypes				// types of questions
	{						// DO NOT REORDER. THERE ARE RULES THAT REFER TO THESE BY NUMBER.
		QADVICE,			// a "real" question: user asking for advice, etc.
							// it is also for "unknown" (initial state).  The point is that
							// a QADVICE question is not handled by special lookup routines.
							// It's just ordinary user input.
							// the rest are for quiz-type questions involving fact
							// array searches
		QMISC,     			// (not used, here for completeness)
		QWHO,               // a question about WHO did something,
		QDIDWHAT,           // etc...
		QTOWHOM,            //
		QWHEN,              //
		QWHERE,             //
		QWHY,               //
		QHOW,               //
		QCOND,				//
		QYESNO,				// a yes/no question
		QLOOKUP				// requires dictionary research
	} qtype;				// type of question

	enum tenses
	{
		NONE,				// missing or unknown
		PAST,
		PRESENT,
		FUTURE
	} tense;				// the tense of the primary verb phrase

	// static members
	static TextSubArray Contractions;
	static TextSubArray ContextReverser;

	// lists of words that almost automatically flag what a phrase's purpose is
	static WordList iswords;	// verbs indicating state or change of state
								// (usu. followed by adj or noun phrase,
								// not dir. or indir. obj.)
	static WordList InfiniVerbs;// verbs often followed by an infinitive
	static WordList Pronouns;	// list of pronouns

	static WordList YNQStarters;// words that commonly begin Yes/No questions

	// static functions
	static string sentypetoalpha(sentypes s);
	static string qtypetoalpha(qtypes q);
	static string tensetoalpha(tenses t);

	static SDDEHandler* Dde;	// a copy of the app's handler, for local use

protected:
};
//----------------------------------------------------------------------------
// 						end class FACT
//////////////////////////////////////////////////////////////////////////////
// collection of FACTS with related operations
// leave indirect to avoid time consuming copying when adding elements
// making it direct may also require redefining FACT::operator ==
class FactMemory : public TIArrayAsVector<FACT>
{
public:
	FactMemory();
	~FactMemory();

	uint AddFacts(const string& s);
	string ShowFacts(BOOL flaggedonly = TRUE);
	string ShowLast();
	string answerq();

	// const FACT* lastfact();				// or zero if none
	const FACT* FindBestMatch(const FACT*);

protected:
};
//----------------------------------------------------------------------------
// 						end class FactMemory
//////////////////////////////////////////////////////////////////////////////

#endif			// fact.h

fact.cpp

/*   fact.cpp         	3/5/02
	This is part of the WTalk project.
	Copyright (C)1993-2002 Steven Whitney.
	Published under GNU GPL (General Public License) Version 2, with ABSOLUTELY NO WARRANTY.
	Initially published by http://25yearsofprogramming.com.

------
To Do:

see if it's practical to use flag parsenow to ALSO prevent the new TOKs from looking up their
MDB info, to speed up construction of temporary FACTs that are used for nothing but
token searches.
Potential problems:
SameAs
	uses TOK operator ==, which uses AlphaID and typenow.

------
Notes:

----------------------
Keep this discussion, even though this is not exactly how I wound up doing it.
When user asks a question, you must search for the FACT that best matches what is
being asked about.  Method must be very flexible because of word variants AND
word order variations in sentence construction.  That is, you're unlikely to
find the exact phrases in the user's question, AND the words themselves
may be variant forms in the facts being searched.  Solution:
Both TOK and FACT have more than one way that a match can be defined, ranging
from a strict == to barely similar.  If a stringent definition fails to
produce a match, drop to the next lower level.  To search for a matching fact:
for( strictest rule for FACT comparison ; ; next, less strict rule)
	for( strictest rule for TOK comparison ; ; next, less strict rule)
		if(satisfactory match for your purpose)
			quit;
Example:
Jack and Jill ran up the hill in the rain to fetch a pail of water.
Sample questions that should be answerable:
Who ran in the rain?
Who went up the hill to fetch water?
Where did Jack and Jill run?
Who fetched?
What did Jack and Jill fetch?

----------------------
Another example, designed to be confusing for the way I did wind up doing the match tests:
Given:
The man ran to the store.
The man ran the store.
The man stored some bran.
The man who ran the store ran to the office.
Ask:
Who ran to the store?

----------------------

*/
#include "fact.h"
#pragma hdrstop

//////////////////////////////////////////////////////////////////////////////
// class FACT
//----------------------------------------------------------------------------
// Define static FACT members.

SDDEHandler* FACT::Dde = 0;

TextSubArray FACT::Contractions;   	// these are read from disk elsewhere
TextSubArray FACT::ContextReverser;

// lists of words that almost automatically flag what a phrase's purpose is
WordList FACT::iswords(15);		// verbs indicating state or change of state (usually
								// followed by adj or noun phrase, not dir. or indir. obj.)
WordList FACT::InfiniVerbs(50); // verbs often followed by infinitives
WordList FACT::Pronouns(20);	// pronouns
WordList FACT::YNQStarters(10); // words that commonly begin yes/no questions

//----------------------------------------------------------------------------
// normal constructor
FACT::FACT(string s, BOOL parsenow) : toks(5,0,5)
{
Sentence = s;
sentype = UNKNOWN;   	// start as unknown, Parse() resets, if it can.
qtype = QADVICE;
tense = PRESENT;
flag = 0;

Tokenize();  			// no harm if s is empty
if(parsenow)
	DDEParse();

}                			// constructor
//----------------------------------------------------------------------------
// this "copy" constructor copies count tokens from startpos
FACT::FACT(const FACT& other, int startpos, int count)
	: toks(other.GetItemsInContainer(),0,5)
{
sentype = PARTIAL;
qtype = other.qtype;		// ?
tense = other.tense;
flag = 0;

startpos = max(startpos,0);						// prevent negative indexes
for(int i = startpos ; ((i < other.GetItemsInContainer()) && (count > 0)) ; i++, count--)
	toks.Add(other.toks[i]);
RebuildSentence();
}                    		// "substring-FACT" ctor
//----------------------------------------------------------------------------
// this "copy" constructor copies all tokens that are marked as the specified infotype.
// the ability to do this makes it possible to easily extract just the subject,
// or verb, object, or where info, etc.  Makes a lot of misc fns unnecessary.
FACT::FACT(const FACT& other, TOK::infotypes itype) : toks(other.GetItemsInContainer(),0,5)
{
sentype = PARTIAL;
qtype = other.qtype;		// ?
tense = other.tense;
flag = 0;
for(int i = 0 ; i < other.GetItemsInContainer() ; i++)
	if(other.toks[i].InfoType == itype)
		toks.Add(other.toks[i]);   			// uses TOK copy ctor
RebuildSentence();
}                   		// infotype ctor
//---------------------------------------------------------------------------
FACT::~FACT()
{
}                   		// destructor
//---------------------------------------------------------------------------
// a FACT in the FactMemory must be exactly unique (itself only)
BOOL FACT::operator == (const FACT& other) const
{
return(&other == this);
}                        	//operator ==
//----------------------------------------------------------------------------
// like operator ==, but compares members
BOOL FACT::SameAs(const FACT& other) const
{
// return(Similarity(other) == 1.0);	// alternate (but slow) possibility

if(GetItemsInContainer() != other.GetItemsInContainer())
	return(FALSE);
for(int i = 0 ; i < GetItemsInContainer() ; i++)
	if(!(toks[i] == other.toks[i]))
		return(FALSE);
return(TRUE);
}							//SameAs
//---------------------------------------------------------------------------
// a measure of how similar two FACTs are to each other.
// two measures I thought of but didn't use were
// percent of perfectly matching TOKs found in order
// percent of perfectly matching TOKs found in any order
// I thought this function might give similar results for those situations, anyway,
// but am not sure.  Also, it is unclear how those measures, if given multipliers
// as for TOK, should be scaled to properly compare with the values returned
// by Similarity.  They're not particularly similar or compatible measures,
// and is a small # of perfect matches better than lots of similarity?
//
double FACT::Similarity(const FACT& other) const
{
if(this == &other)				// the same object
	return(1.0);

if(!GetItemsInContainer() && !other.GetItemsInContainer())	// 2 empties do match
	return(1);
if(!GetItemsInContainer() || !other.GetItemsInContainer())	// 1 empty can't match
	return(0);

// Construct a table, with all the TOKs from this FACT on one axis, other's TOKs on the other.
// the table intersections hold the Similarity measure for the intersecting TOKs.
// You wind up with an array with measures of how well every TOK in one FACT
// matches every TOK in the other.

int elements = GetItemsInContainer() * other.GetItemsInContainer();
TSArrayAsVector<TOKComparisonEntry> TOKTable(elements,0,1);

for(int i = 0 ; i < GetItemsInContainer() ; i++)
	for(int j = 0 ; j < other.GetItemsInContainer() ; j++)
		TOKTable.Add( TOKComparisonEntry( &(toks[i]), &(other.toks[j]) ) );

// all the best matches are at the start of TOKTable, which also means that the best
// match for each TOK also is encountered first.  Starting at the beginning,
// for each best match found, go through the REST of the array deleting
// all the other matches for BOTH of the paired TOKs.  That is, once 2 TOKs
// have been paired to each other (as a best match), neither TOK can be
// paired with any other, so delete all those subsequent (inferior) pairings.
// In table format, it means delete all the other squares in the row and in
// the column in which the intersecting square falls, leaving only the intersection.
//
// When finished, TOKTable should contain only as many elements as the shorter FACT
// has TOKs, each one paired with its best match from the longer FACT.

for(i = 0 ; i < TOKTable.GetItemsInContainer() ; i++)
	for(int j = i + 1 ; j < TOKTable.GetItemsInContainer() ; )
		if(TOKTable[j].RefersTo(TOKTable[i].a) || TOKTable[j].RefersTo(TOKTable[i].b))
			TOKTable.Destroy(j);
		else                      	// only move on to next one if you didn't destroy j
			j++;					// otherwise, j is now the next one anyway.

// build a composite score of how well the two FACTs match each other, by summing the
// scores of the individual TOKs and dividing by the count, for an average TOK match score.
double score = 0;
for(i = 0 ; i < TOKTable.GetItemsInContainer() ; i++)
{
	// elements with Similarity of 0 were left in above in case the number of them,
	// or the total number of remaining elements, is ever useful for scoring.
	// The reason I don't use a method multiplying the scores of the individual TOKS
	// to get a composite score is that a large *number* of partial TOK
	// matches would drag the score down; thus a shorter comparison FACT (other)
	// would be more likely to score well than a longer one in which there may
	// actually have been better matches, only more of them, which lowered the score.

	score += TOKTable[i].Similarity;
}
// different method here than in string similarity().
// using the smaller count (min) makes searches for short questions ("Who ran?") more
// likely to tie with scores of 1.0, allowing you to find *all* matches, not just one best.
// using the larger count (max) increases spread of scores, making ties less likely,
// and making a long fact score worse than a short fact against the same short question,
// even though they may both contain equally valid answers to the question.

score /= min(GetItemsInContainer(), other.GetItemsInContainer());
return(score);
}                    		// Similarity
//---------------------------------------------------------------------------
// append all the tokens from another fact
void FACT::operator += (const FACT& other)
{
for(int i = 0 ; i < other.GetItemsInContainer() ; i++)
	toks.Add(other.toks[i]);
RebuildSentence();
}
//---------------------------------------------------------------------------
// read should extract only up to .;?  (sentence terminators)
istream& operator >> (istream& is, FACT& )
{
logerror("Function operator >> (istream& is, FACT& ) does nothing.");
return(is);
}
//---------------------------------------------------------------------------
// write
ostream& operator << (ostream& os, const FACT& f)
{
for(int i = 0 ; i < f.GetItemsInContainer() ; i++)
	os << (string)(f.toks[i]) << " ";
return(os);
}
//---------------------------------------------------------------------------
// recreate Sentence from toks
const string& FACT::RebuildSentence()
{
Sentence.remove(0);
for(int i = 0 ; i < GetItemsInContainer() ; i++)
	Sentence += toks[i] + " ";
Sentence = Sentence.strip(string::Both);
return(Sentence);	
}                  			// RebuildSentence
//---------------------------------------------------------------------------
// remove all tokens at and after the given index location
void FACT::Remove(int cutpoint)
{
while(GetItemsInContainer() > cutpoint)
	toks.Destroy(cutpoint);
RebuildSentence();
}                      		// remove
//----------------------------------------------------------------------------
// insert another fact into this one, at the given loc
void FACT::Insert(const FACT& other, int loc)
{
loc = max(loc, 0);											// prevent negative
loc = min(loc, (int)GetItemsInContainer());  			// limit loc to array end (append)
for(int i = 0 ; i < other.GetItemsInContainer() ; i++)
	toks.AddAt(other.toks[i], loc);
RebuildSentence();
}                      		// Insert
//---------------------------------------------------------------------------
BOOL FACT::IsGood() const
{
	return(GetItemsInContainer() > 0);
}                        	//IsGood
//---------------------------------------------------------------------------
// extract all tokens from sentence and put them in toks container.
// returns number of tokens created.  If 0, the FACT is useless.
int FACT::Tokenize()
{
uint sp = 0;              					// index into Sentence: token search starting point
string token;                   			// temp string for alphanumeric token from sentence
while((sp = gettoken(Sentence,sp,token)) != 0)	// gettoken returns new place to start from
{
	// #error it might be best to only Add it if the TOK ctor successfully found
	// or created an AlphaID, else just omit the token.  unsure what problems can be
	// caused in other locs by an AlphaID of 0, but TOK ctor does sometimes fail:
	// most recently when wtalk was launched by winword, and DDEML or
	// Windows got confused or overwhelmed.
	toks.Add(TOK(token)); 	        		// add the word
}
return(GetItemsInContainer());
}                             // Tokenize
//---------------------------------------------------------------------------
// display complete info from sequential token nodes (sentence)
string FACT::ShowToks() const
{
ostrstream os;
os << "SentenceType  : " << sentypetoalpha(sentype) << endl;
os << "QuestionType  : " << qtypetoalpha(qtype) << endl;
os << "Tense         : " << tensetoalpha(tense) << endl;
os << "WORD         TYPE   PHRASE  INFO   " << endl;
os << "------------ ------ ------- -------" << endl;
for(int i = 0 ; i < GetItemsInContainer() ; i++)
{
	string wordtype = TOK::toalpha(toks[i].Type()); 	// translate types to alpha
	string phrasetype = TOK::phrasetoalpha(toks[i].PhraseType);
	string itype = TOK::infotoalpha(toks[i].InfoType);

	os << setw(12) << setiosflags(ios::left) << (string)toks[i];
	os << " " << setw(6) << wordtype;
	os << " " << setw(1) << (toks[i].PhraseType) << ":" << setw(5) << phrasetype;
	os << " " << setw(10) << itype << endl;
}
os << ends;
string s = os.str();
delete[] os.str();
return(s);
}                  			// ShowToks
//----------------------------------------------------------------------------
// how many times is this word in toks[]?
int FACT::Count(const string& s) const
{
int count = 0;
int tokcount = GetItemsInContainer();
for(int i = 0 ; i < tokcount ; i++)
	if((string)toks[i] == s)
		count++;
return(count);
}                      		// Count
//----------------------------------------------------------------------------
// get index of first occurrence of word in toks[], at or after pos.
// You can use this to find Nth occurrence; haven't figured out how.
// returns index or INT_MAX if not found (same as TArray::Find)
// note that this uses == test for the strings, not the more restrictive TOK ==.
int FACT::Find(const string& s, int pos) const	// a pos beyond end of fact is ok.
{
int tokcount = GetItemsInContainer();
for(int i = 0 ; i < tokcount ; i++)      // start at beginning so returned index is real
	if(i >= pos)						 // ones before pos don't count
		if((string)toks[i] == s)
			return(i);
return(INT_MAX);
}                      			// Find string
//----------------------------------------------------------------------------
// search for first occurrence of another FACT wholly contained in this one, at or after pos.
// returns index of token that starts the contained FACT, or INT_MAX if not found.
// note that this uses == test for the strings, not the more restrictive TOK ==.
int FACT::Find(const FACT& other, int pos) const
{
int thiscount = GetItemsInContainer();
int othercount = other.GetItemsInContainer();
if(!thiscount || !othercount || (othercount > thiscount))
	return(INT_MAX);

// first tok may occur more than once at or after pos, so must search from each.
while(1)
{
	// first tok found?  if entire search succeeds, index will be the return value
	int index = Find((string)(other.toks[0]), pos);
	if(index == INT_MAX)                   	// first token not found
		return(INT_MAX);
	for(int i = 1 ; i < othercount ; i++) 	// search for the rest of the toks
	{
		if((index + i) >= thiscount )  		// hit end of this: no more toks to search,
			return(INT_MAX);                // and no later start points can succeed, either.
		if((string)toks[index + i] != (string)other.toks[i])
			break;                       	// nonmatch: quit searching
	}
	if(i == othercount)	// we made it to end: exact match was found.
		return(index);
	pos = index + 1;	// start new search from token AFTER the previous Find
}
}            	   			// Find FACT
//----------------------------------------------------------------------------
// whether this FACT contains all the toks from other, meeting the specified conditions.
// if you add a capability for synonym lookup, this might be a starting point for
// a function that determines whether 2 FACTS contain essentially the same information.
BOOL FACT::Contains(const FACT& other, BOOL insequence, BOOL inorder) const
{
int thiscount = GetItemsInContainer();
int othercount = other.GetItemsInContainer();
if(!thiscount || !othercount || (othercount > thiscount))
	return(FALSE);
if(insequence)
	return(Find(other,0) != INT_MAX);
if(inorder)
{
	int pos = 0;
	// search for each tok at or after pos of previous
	for(int i = 0 ; i < othercount ; i++)
	{
		pos = Find((string)(other.toks[i]), pos);
		if(pos == INT_MAX)                // any token not found means failure
			return(FALSE);
		pos++;					// start new search from token AFTER the previous Find
	}
	return(TRUE);
}
// any order ok
// create copy of other, Destroy() all toks that occur in this, see if anything's left
return(FACT(other).Destroy(*this) == othercount);
}								//contains
//----------------------------------------------------------------------------
// destroy token at loc
// returns loc if the item was found and destroyed, else INT_MAX
int FACT::Destroy(int loc)
{
if((loc >= 0) && (loc < toks.GetItemsInContainer()))
{
	toks.Destroy(loc);
	return(loc);
}
return(INT_MAX);
}                           	//Destroy
//----------------------------------------------------------------------------
// destroy 1st occurrence of s after startpos
// returns index that the item HAD if it was found and destroyed, else INT_MAX
int FACT::Destroy(const string& s, int startpos)
{
int i;
if((i = Find(s,startpos)) != INT_MAX)
	return(Destroy(i));
return(INT_MAX);
}                           	//Destroy
//----------------------------------------------------------------------------
// destroy in this FACT the first occurrence of each token in other.
// returns number of toks destroyed.
int FACT::Destroy(const FACT& other)
{
int dcount = 0;
for(int i = 0 ; i < other.GetItemsInContainer() ; i++)
	if(Destroy((string)(other.toks[i]),0) != INT_MAX)
		dcount++;
return(dcount);
}                            	//Destroy
//----------------------------------------------------------------------------
// returns a string containing the entire sentence with context reversed
// because of reverse.dat, it fails to translate "you were" into "I was" (says: I were).
// but user more likely said "we were" anyway, which you don't want to translate to
// "you was".
string FACT::ReversedContext()
{
string result;
for(int j = 0 ; j < GetItemsInContainer() ; j++)	// test each word
{
	string token = toks[j];						// text of 1 word, might get changed
	int reversecount = ContextReverser.GetItemsInContainer();
	// can't you use ContextReverser.Find here?
	for(int k = 0 ; k < reversecount ; k++)		// compare each word in reverse array to token
		if(ContextReverser[k].Key == token) 	// if this word might need translation,
		{
			// some special cases, based on the previous word
			if(j > 0)
			{
				TOK prevtok = toks[j-1];					// previous token
				string previous = prevtok;				    // previous word

				// leave alone: he/she/it was, they are/were
				if( ((token == "are") && (previous != "you"))	||
					((token == "was") && (previous != "i")) 	||
					((token == "were") && (previous != "you")) )
						break;

				// "you" is a direct or indirect object, so translate
				if((token == "you") && (prevtok.TypeIs(PREP) || prevtok.TypeIs(VERB)) )
				{
					token = "me";
					break;
				}
			}
			// special cases didn't apply, so translate: copy translation into token
			token = ContextReverser[k].Sub;
			break;
		}
	result += token + " ";   // translated, or not
}
return(result);
}                 		// ReversedContext
//---------------------------------------------------------------------------
// this fn (and other related) is a replacement for the entire StateMachine class,
// but it won't work well until a lot of experience is encoded in MDB's LegalPhrases.
//
// Advantages of the method it uses (testing against all known-legal phrase constructions):
// 1. it can learn new legal constructions, which the StateMachines can't.
// 2. it doesn't modify tok.typenow while it's testing, so you can safely use it to
//    test for a phrase anywhere at any time without any side effects.
// Disadvantage is that the total number of possible phrase constructions is
// astronomical; fortunately, the number of *common* ones should be manageable.
//
// returns length (in tokens) of a phrase of the requested type, starting at startpos,
// or zero, meaning that that type of phrase cannot begin at that startpos.
//
// retrieves legal phrases in order of descending length, and returns the first found,
// to create the *longest* phrase possible.
// (May not be desirable.  The end result you really want is
// not individual phrases of maximum length, but to get
// (ideally) every tok in the sentence assigned to *some* legal phrase.
// But each call here only parses (knows about) 1 phrase, so getting the ideal result
// would require doing multiple parsings and choosing the one that assigned the
// maximum NUMBER of toks to legal phrases.)
//
// startpos = index within toks[] to begin testing from
// ptype = the type of phrase you are looking for
// fitonly = if TRUE, requires a phrase to FIT each word's current typenow, won't change it
// 			 if FALSE, uses HasType() to test, and changes typenow to fit desired phrase.
// markwt = whether to assign each TOK its Typenow required by the best sequence found
// markpt = whether to assign each TOK to the phrase type ptype
int FACT::Phrase(int startpos, int ptype, BOOL fitonly,
				BOOL markwt, BOOL markpt)
{
if((startpos < 0) || (startpos >= GetItemsInContainer()) || !toks[startpos].TypeCount())
	return(0);

// an untested possibility: since a PREPPHR is a PREP + NOUNPHR, test for those two,
// and eliminate need for PREPPHR entries in MDB LegalPhrases.  However, this will probably
// worsen the problem with infinitives, and it also will allow a PREPPHR to be wrongly
// built from some common noun phrase sequences that cannot be in PREPPHR sequences.
// if((ptype == TOK::PREPPHR) && toks[startpos].HasType(PREP))
// {
// 	int i = Phrase(startpos + 1, TOK::NOUNPHR);
// 	if(i > 0)
// 		return(i + 1);
// 	return(0);
// }

string typelist;
if(fitonly)
	typelist += (uchar)toks[startpos].Type();
else
	typelist = toks[startpos].TypeList;
for(int i = 0 ; i < typelist.length() ; i++)
	typelist[i] += 64;								// convert to "ABC" format

// SQL to retrieve the known-legal phrases to test against.
string topic("WTALK;SQL SELECT DISTINCTROW LegalPhrases.Seq FROM LegalPhrases "
			 "WHERE ((LegalPhrases.Seq Like \"[%s]*\") "
			 "AND ((Len([LegalPhrases].[Seq]))<=%s) "
			 "AND (LegalPhrases.PhraseType=%s)) "
			 "ORDER BY Len([LegalPhrases].[Seq]) DESC;" );
topic.substring("%s") = typelist;
topic.substring("%s") = tostring(GetItemsInContainer() - startpos);
topic.substring("%s") = tostring(ptype);

SDDEConv* chan1 = Dde->DDEInitiate("MSACCESS",topic);
if(!chan1)
	return(0);

string ABC;							// the retrieved legal phrase sequence in ABC format
int len;							// its length

// if this is too slow, could retrieve all rows at once using "Data", and then
// loop through chan1->Fields[].  change query to TOP 16000 to prevent TArray fill-up.
while(chan1->DDERequest("NextRow"))
{
	ABC = (string)(*chan1);
	len = ABC.length();
	int i, j;
	BOOL isok;
	for(i = 0, j = startpos ; i < len ; i++, j++)
	{
		isok = (fitonly ? toks[j].TypeIs(ABC[i] - 64) : toks[j].HasType(ABC[i] - 64));
		if(!isok)
			break;
	}
	if(i == len)					// if it made it all the way through the loop,
	{                               // all of ABC was matched
		for(i = 0, j = startpos ; i < len ; i++, j++)
		{
			if(markwt) toks[j].SetTypenow(ABC[i] - 64);
			if(markpt) toks[j].PhraseType = ptype;
		}
		Dde->DDETerminate(chan1);
		return(len);              	// the longest possible phrase length.
	}
}
Dde->DDETerminate(chan1);
return(0);
}                  				//Phrase
//---------------------------------------------------------------------------
// returns the TYPE of the longest phrase it could create starting at startpos,
// or TOK::UNASSIGNED(0), meaning that that no phrase could begin at that startpos.
// use this fn to determine WHICH type of phrase you want to begin at startpos.
//
// startpos = index within toks[] to begin testing from
// markwt = whether to assign each TOK its Typenow required by the best sequence found
// markpt = whether to assign each TOK to the phrase type that it determines is best.
int FACT::AnyPhrase(int startpos, BOOL markwt, BOOL markpt)
{
if((startpos < 0) || (startpos >= GetItemsInContainer()) || !toks[startpos].TypeCount())
	return(TOK::UNASSIGNED);

string typelist = toks[startpos].TypeList;
for(int i = 0 ; i < typelist.length() ; i++)
	typelist[i] += 64;								// convert to "ABC" format

// SQL to retrieve the known-legal phrases to test against.
// #error manually edited, unchecked by the MSAccess QBE grid
string topic("WTALK;SQL SELECT DISTINCTROW LegalPhrases.PhraseType, LegalPhrases.Seq "
			 "FROM LegalPhrases "
			 "WHERE ((LegalPhrases.Seq Like \"[%s]*\") "
			 "AND ((Len([LegalPhrases].[Seq]))<=%s)) "
			 "ORDER BY Len([LegalPhrases].[Seq]) DESC, LegalPhrases.PhraseType;" );
topic.substring("%s") = typelist;
topic.substring("%s") = tostring(GetItemsInContainer() - startpos);

SDDEConv* chan1 = Dde->DDEInitiate("MSACCESS",topic);
if(!chan1)
	return(TOK::UNASSIGNED);

string ABC;							// the retrieved legal phrase sequence in ABC format
int len;							// its length
int ptype;							// PhraseType of best fit phrase
while(chan1->DDERequest("NextRow"))
{
	fromstring(ptype, chan1->Fields[0]);
	ABC = chan1->Fields[1];
	len = ABC.length();
	int i, j;
	for(i = 0, j = startpos ; i < len ; i++, j++)
		if(!toks[j].HasType(ABC[i] - 64))
			break;
	if(i == len)	// if it made it all the way through the loop, all of ABC was matched
	{
		for(i = 0, j = startpos ; i < len ; i++, j++)
		{
			if(markwt) toks[j].SetTypenow(ABC[i] - 64);
			if(markpt) toks[j].PhraseType = ptype;
		}
		Dde->DDETerminate(chan1);
		return(ptype); 				// the type of the longest possible phrase.
	}
}
Dde->DDETerminate(chan1);
return(TOK::UNASSIGNED);
}                  				//AnyPhrase
//---------------------------------------------------------------------------
// mark consecutive tokens as belonging to a phrase.
// returns index within array of the first token after end of the marked phrase
// (so that a new search can begin at that location),
// returns an index beyond end of array if it received invalid parameters
// or if marking went to last token.
// pos = index within toks to begin marking at
// count = number of tokens to mark
// ptype = type of phrase
// overwrite = whether to overwrite an already marked phrasetype (default = TRUE)
int FACT::MarkPhrase(int pos, int count, TOK::phrasetypes ptype, BOOL overwrite)
{
int tokcount = GetItemsInContainer();	// total tokens in sentence
if((pos < 0) || (pos >= tokcount))			// invalid index
	return(tokcount);

for(int i = 0 ; (i < count) && (pos < tokcount) ; i++, pos++)  	// stop at count or last tok
{
	// ok to overwrite only if unmarked or already the type being marked
	if(!overwrite)
		if((toks[pos].PhraseType != ptype) && (toks[pos].PhraseType != TOK::UNASSIGNED))
			break;
	toks[pos].PhraseType = ptype;
}
return(pos);
}                   	//MarkPhrase
//---------------------------------------------------------------------------
// mark consecutive tokens as having an infotype.
// returns index within array of the first token after end of the marked phrase
// returns index beyond end of array if it received invalid parameters
// 			or if marking went to last token.
// pos = index within toks to begin marking at
// count = number of tokens to mark
// itype = infotype to use
// overwrite = whether to overwrite an already marked infotype (default = TRUE)
int FACT::MarkInfotypes(int pos, int count, int itype, BOOL overwrite)
{
int tokcount = GetItemsInContainer();	// total tokens in sentence
if((pos < 0) || (pos >= tokcount))			// invalid index
	return(tokcount);

for(int i = 0 ; (i < count) && (pos < tokcount) ; i++, pos++)  	// stop at count or last tok
{
	// ok to overwrite only if unmarked or already the type being marked
	if(!overwrite)
		if((toks[pos].InfoType != itype) && (toks[pos].InfoType != TOK::MISC))
			break;
	toks[pos].InfoType = itype;
}
return(pos);
}                   	// MarkInfotypes
//---------------------------------------------------------------------------
BOOL FACT::UploadToTOKSTable()
{
SDDEConv* chan1 = Dde->DDEInitiate("MSACCESS","WTALK");
if(!chan1)
	return(FALSE);

chan1->DDEExecute("[SetWarnings 0]");
string sql = "[RUNSQL \"DELETE * FROM TOKS;\"]";
chan1->DDEExecute(sql);
for(int i = 0 ; i < GetItemsInContainer() ; i++)
{
	sql = "[RUNSQL \"INSERT INTO TOKS ([AlphaID], [Type], [PhraseType], [InfoType]) "
		  "VALUES (%s, %s, %s, %s);\"]";
	sql.substring("%s") = tostring(toks[i].AlphaID);
	sql.substring("%s") = tostring(toks[i].Type());
	sql.substring("%s") = tostring(toks[i].PhraseType);
	sql.substring("%s") = tostring(toks[i].InfoType);

	chan1->DDEExecute(sql);
}
Dde->DDETerminate(chan1);
return(TRUE);
}                         		//UploadToTOKSTable
//---------------------------------------------------------------------------
// returns the appropriate text translation of the given numeric sentype
string FACT::sentypetoalpha(sentypes type)
{
if(type == UNKNOWN)   	{ return("Unknown"); }
if(type == STATEMENT)   { return("Statement"); }
if(type == QUESTION)    { return("Question"); }
if(type == ANSWERYES)   { return("AnswerYES"); }
if(type == ANSWERNO)   	{ return("AnswerNO"); }
if(type == ANSWERIND)   { return("AnswerIND"); }
if(type == DESCRIPTION) { return("Description"); }
if(type == COMMAND)    	{ return("Command"); }
if(type == EXCLAMATION) { return("Exclamation"); }
if(type == PARTIAL)   	{ return("Partial"); }
return("----");
}                        	//sentypetoalpha
//---------------------------------------------------------------------------
// returns the appropriate text translation of the given numeric qtype
string FACT::qtypetoalpha(qtypes type)
{
if(type == QADVICE)   	{ return("QAdvice"); }
if(type == QMISC)   	{ return("QMisc"); }
if(type == QWHO)    	{ return("QWho"); }
if(type == QDIDWHAT)	{ return("QDidWhat"); }
if(type == QTOWHOM)   	{ return("QToWhom"); }
if(type == QWHEN)   	{ return("QWhen"); }
if(type == QWHERE)   	{ return("QWhere"); }
if(type == QWHY)    	{ return("QWhy"); }
if(type == QHOW)    	{ return("QHow"); }
if(type == QCOND)   	{ return("QCond"); }
if(type == QYESNO)    	{ return("QYesNo"); }
if(type == QLOOKUP)   	{ return("QLookup"); }
return("----");
}                  			//qtypetoalpha
//---------------------------------------------------------------------------
// returns the appropriate text translation of the given verb tense.
string FACT::tensetoalpha(tenses type)
{
if(type == NONE)   	{ return("None"); }
if(type == PAST)   	{ return("Past"); }
if(type == PRESENT) { return("Present"); }
if(type == FUTURE)	{ return("Future"); }
return("----");
}                  			//tensetoalpha
//---------------------------------------------------------------------------
//						end class FACT
//////////////////////////////////////////////////////////////////////////////

ddeparse.cpp

/*	ddeparse.cpp         	9-5-01
	This is part of the WTalk project.
	Copyright (C)1993-2001 by Steven Whitney
	Published under GNU GPL (General Public License) Version 2, with ABSOLUTELY NO WARRANTY.
	Initially published by http://25yearsofprogramming.com.

This is the code for the FACT::DDEParse() function.

5/18/2006
A FACT is essentially a parsed sentence or phrase, though if I recall correctly, a long
compound sentence containing multiple statements may get broken into multiple FACTS.

FACT::Parse, which is now FACT::DDEParse, is the function that determines the type of statement
that this fact is, identifies the subject, verb, objects, what phrase types it contains, and
what types of information (who, what, when, where, why) are conveyed by its various parts.
Ultimately it also renders a judgment whether the statement was fully parseable.

This function is a separate source file (and project node) because it was a very long time in
development and once existed simultaneously in multiple versions tested against each other,
selected by the #define at the top of the file.

The DDE refers to the fact that in contrast to earlier versions that used lists of words stored
in text files, this one makes extensive use of information stored in the WTALK.MDB database,
which it accesses using SQL via DDE, a method used extensively throughout the program.

Early parsing attempts in this program used a state machine alone and later in combination with
a context-free recursive descent parser which attempted to do grammatical analysis.  These
approaches hit dead ends, quickly reaching the limits of their abilities (which even at that
point were poor) even when used in combination with early versions of the inferential
techniques that this function now uses.  In developing this function, I reached a conclusion:
people do not extract meaning from sentences using grammatical analysis.  They do not even
extract meaning from sentences linearly.  Rather, the information in sentences jumps out at them,
in chunks that they instantly recognize as meaningful, and they likewise instantly know, using
cues they've learned from experience, what types of meaning it contains.  This is discussed at
greater length in talk.doc.

Accordingly, I developed a set of pragmatic methods and specific tests that pick apart the
sentence to try to identify the meaning-containing parts of it and tag them appropriately.
These specific tests outperformed the state machine and recursive descent parser.

Basically, I wound up creating a "bag of tricks" that did the information extraction based on
various clues contained in the sentence and in its words, and it turned out that this
assortment of cheap tricks outperformed the more strictly analytical methods often enough
to make them more useful.

------
To do:

*/
#include "fact.h"
#pragma hdrstop

// PARSE VERSIONS: (there is now only 1 version)
// USING LOCAL EQUIV OF ANYPHRASE() = 1

#define PARSEVERSION	1

//////////////////////////////////////////////////////////////////////////////
#if (PARSEVERSION == 1)
//////////////////////////////////////////////////////////////////////////////
//---------------------------------------------------------------------------
// assigns tokens to phrases and marks phrases by the infotype they contain.
// returns TRUE if at least subject and verb are present, else FALSE
//
int FACT::DDEParse()
{
int tokcount = GetItemsInContainer();	// number of tokens in this sentence
if(!tokcount)							// nothing to process
	return(FALSE);

int i;
BOOL gotwho = FALSE, gotaverb = FALSE;

//-------------------------------------
// DETERMINE SENTYPE, USING FINAL TOKEN FOR INITIAL TESTS:
string s = toks[tokcount-1];				// copy last token
if((s == ".") || (s == ";"))
	sentype = STATEMENT;
else
	if(s.contains("?"))        				// allows ??, ???, ?!, etc.
		sentype = QUESTION;
	else
		if(s.contains("!"))					// !? question already handled above; this allows "!!!"
			sentype = EXCLAMATION;			// use somewhere: it indicates user interest
		else
			if((s == ",") || (s == ":") || (s.contains("...")))
				sentype = PARTIAL;

//-------------------------------------
// CHECK FOR ANSWERYES, ANSWERNO, ANSWERIND
// retrieve list of all MDB entries with the relevant infotypes;
// if any is found within the first few words, mark the FACT as the appropriate answer type.
// order is intentional: later determinations override earlier ones.
string topic("WTALK;SQL SELECT DISTINCT Phrases.Alpha, Defs.InfoType, Defs.UseCount "
			 "FROM Phrases INNER JOIN Defs ON Phrases.AlphaID = Defs.AlphaID "
			 "WHERE ((Defs.InfoType=9 Or Defs.InfoType=10 Or Defs.InfoType=11)) "
			 "ORDER BY Defs.InfoType DESC , Defs.UseCount DESC;");
SDDEConv* chan1 = Dde->DDEInitiate("MSACCESS",topic);
if(chan1)
{
	string alpha;
	int infotype;
	while(chan1->DDERequest("NextRow"))
	{
		fromstring(infotype, chan1->Fields[1]);

		// once you have a value, skip over all remaining tests in its section
		if( ((sentype == ANSWERIND) && (infotype == TOK::ANSIND)) ||
			((sentype == ANSWERNO) && (infotype == TOK::ANSNO)) ||
			((sentype == ANSWERYES) && (infotype == TOK::ANSYES)) )
				continue;

		alpha = chan1->Fields[0];
		if(Sentence.find(alpha) < 20)      	// crude preliminary string search (fast)
			if(Find(FACT(alpha,FALSE)) < 5) // more definitive token search (slow: uses DDE)
				switch(infotype)			// FALSE in ctor prevents endless recursion.
				{
					case TOK::ANSYES: sentype = ANSWERYES; break;
					case TOK::ANSNO: sentype = ANSWERNO;  break;
					case TOK::ANSIND: sentype = ANSWERIND; break;
				}
	}
	Dde->DDETerminate(chan1);
}

//-------------------------------------
// IF SENTENCE IS A QUESTION, FURTHER REFINE ITS TYPE INTO A QTYPE.
// also strips the indicator words out of toks[], leaving behind only words that will
// help when searching for answers to the question, but leaves .Sentence as user entered it.
// DON'T CALL REBUILDSENTENCE!
if(sentype == QUESTION)
{
	uint i, j;

	// test for some other variations of this question type
	if((toks[0] == "who") && Phrase(1, TOK::VERBPHR, FALSE, FALSE, FALSE))
	{
		qtype = QWHO;
		toks.Destroy(0);			// delete the indicator word's token
	}
	if( (Sentence.startswith("who did ") || Sentence.startswith("what did ")) &&
		(Find("do") == INT_MAX) )		// prevents confusion with didwhat test below
	{
		qtype = QTOWHOM;
		toks.Destroy(0);		// "who"
		toks.Destroy(0);  		// "did": the 2nd token became the first; delete it, too
	}
	if(((i = Sentence.find("what did")) != NPOS) && ((j = Find("do")) != INT_MAX) && (j > i))
	{
		qtype = QDIDWHAT;
		toks.Destroy(j); toks.Destroy(i); toks.Destroy(i);	// must destroy j first!
	}
	if((i = Sentence.find("why did")) != NPOS)
	{
		qtype = QWHY;
		toks.Destroy(i); toks.Destroy(i);
	}
	if(Sentence.startswith("when did"))
	{
		qtype = QWHEN;
		toks.Destroy(0); toks.Destroy(0);
	}
	if(Sentence.startswith("where did"))
	{
		qtype = QWHERE;
		toks.Destroy(0); toks.Destroy(0);
	}
	if(Sentence.startswith("how did"))
	{
		qtype = QHOW;
		toks.Destroy(0); toks.Destroy(0);
	}
	if(	((i = Sentence.find("under what circumstances")) != NPOS) ||
		((i = Sentence.find("under what conditions")) != NPOS) ||
		((i = Sentence.find("on what conditions")) != NPOS) )
	{
		qtype = QCOND;
		toks.Destroy(i); toks.Destroy(i); toks.Destroy(i);
	}
	// check first tok AND each tok that follows a comma for being a YNQ flag.
	// i is a uint, so INT_MAX + 1 is ok.
	for(i = 0 ; i < GetItemsInContainer() ; i = Find(",",i + 1) + 1)
		if(YNQStarters.HasMember(toks[i]) || iswords.HasMember(toks[i]))
		{
			qtype = QYESNO;
			break;
		}
	if((i = Sentence.find("what is a")) != NPOS)
	{
		qtype = QLOOKUP;
		toks.Destroy(i); toks.Destroy(i); toks.Destroy(i);
	}
	// QMISC not handled.  not sure what it should mean.
}	// END FURTHER REFINE A QUESTION'S TYPE INTO QTYPE.

tokcount = GetItemsInContainer();		// toks may have been deleted above!

//-------------------------------------
// search for specific words that reliably begin phrases with particular infotypes.
// order of IF,BECAUSE,WHEN is intentional.  each could overwrite the previous markings,
// so the ones most often correct when they appear together in a phrase should be last.
// phrase is only marked if user provided required punct, which is appropriate.
// (it's hard enough to parse good grammar, let alone sloppy!)
//
// These tests are first because they remove from consideration dependent clauses that
// otherwise may confuse gotwho and gotaverb.  If you need more
// detailed info from the phrases marked here, you must extract them, strip away the
// if/when/because keyword, and reparse.

// It might be possible to move these into the loop below.

// "IF": mark everything up to (excluding) next "," or terminator as infotype COND
if((i = Find("if")) != INT_MAX)
	for(int k = i + 1 ; k < tokcount ; k++)
		if((toks[k] == ",") || toks[k].TypeIs(TERM))
		{
			MarkInfotypes(i, k - i, TOK::COND, TRUE);
			MarkPhrase(i, k - i, TOK::SUBORD, TRUE); // to prevent its further consideration
			break;
		}
// "WHEN": mark everything up to (excluding) next "," or terminator as infotype WHEN
if((i = Find("when")) != INT_MAX)
	for(int k = i + 1 ; k < tokcount ; k++)
		if((toks[k] == ",") || toks[k].TypeIs(TERM))
		{
			MarkInfotypes(i, k - i, TOK::WHEN, TRUE);
			MarkPhrase(i, k - i, TOK::SUBORD, TRUE);
			break;
		}
// "BECAUSE": mark everything up to (excluding) next "," or terminator as infotype WHY
if((i = Find("because")) != INT_MAX)
	for(int k = i + 1 ; k < tokcount ; k++)
		if((toks[k] == ",") || toks[k].TypeIs(TERM))
		{
			MarkInfotypes(i, k - i, TOK::WHY, TRUE);
			MarkPhrase(i, k - i, TOK::SUBORD, TRUE);
			break;
		}

//-------------------------------------
// SEE IF EACH TOK IS THE START OF A KNOWN MULTI-WORD PHRASE WITH AN IDIOMATIC SEQUENCE.
// like the previous section, this is done before the next section to pre-assign for
// known idioms *throughout the sentence*; next section won't overwrite.
// When it was inside next loop, idioms had to also begin their phrase to be processed at all.
// by being here, you do lose knowledge of variables that may not be set until later
// (gotwho, gotaverb, sentype as DESC, etc.), but many idioms carry their own phrase
// and infotype info (and should whenever possible).
for(i = 0 ; i < tokcount ; i++)
{
	// skip any tok already assigned to a phrase,
	if((toks[i].PhraseType != TOK::UNASSIGNED))
		continue;

	string topic;
	SDDEConv* chan1 = 0;
	chan1 = Dde->DDEInitiate("MSACCESS","WTALK;SQL ");
	if(!chan1)
		return(FALSE);
	topic =
		"SELECT DISTINCTROW Phrases.Alpha, Defs.Seq, Defs.PhraseType, Defs.InfoType "
		"FROM Phrases INNER JOIN Defs ON Phrases.AlphaID = Defs.AlphaID "
		"WHERE ((Phrases.Alpha Like \"%s*\" And Phrases.Alpha Like \"* *\") "
		"AND ((Len([seq]))>1 And (Len([seq]))<=%s)) "
		"ORDER BY Len([seq]) DESC , Defs.UseCount DESC , Defs.LastUsed DESC;";
	topic.substring("%s") = (string)toks[i];
	topic.substring("%s") = tostring(GetItemsInContainer() - i);

	chan1->DDEPoke("SQLTEXT",topic.substr(0,250));
	chan1->DDEPoke("SQLTEXT",topic.substr(250));
	while(chan1->DDERequest("NextRow"))
	{
		string alpha = chan1->Fields[0];
		// CRUDE PRELIMINARY STRING SEARCH (FAST) MORE DEFINITIVE TOKEN SEARCH (SLOW)
		if((Sentence.find(alpha) == NPOS) || (Find(FACT(alpha,FALSE), i) != i))
			continue;

		string ABC = chan1->Fields[1];
		int ptype = TOK::UNASSIGNED, itype = TOK::MISC;
		fromstring(ptype, chan1->Fields[2]);
		fromstring(itype, chan1->Fields[3]);

		int m, n;
		for(m = 0, n = i ; m < ABC.length() ; m++, n++)
		{
			toks[n].SetTypenow(ABC[m] - 64);
			toks[n].PhraseType = ptype;
			toks[n].InfoType = itype;
		}
		break;		// sorted by descending length, so first is longest & we're done
	}
	Dde->DDETerminate(chan1);
}									// for(i = 0 ; i < tokcount ; i++)

//-------------------------------------
// USE PHRASE() TO IDENTIFY AND MARK VARIOUS PHRASE TYPES
// toks (usually) already have as defaults their most common typenow, infotype
// This version is supposed to duplicate version 3, but allow more flexible decision making.
//-------------------------------------
// RUN SEQUENTIALLY THROUGH TOKS, DETERMINING WHICH PHRASETYPE BEGINS *BEST* AT EACH TOK
BOOL localgotwho = FALSE, localgotaverb = FALSE;
int phraselength = 1;
for(i = 0 ; i < tokcount ; i += max(phraselength, 1))
{
	// skip any tok with no available types (which can't start a phrase, anyway).
	// a previous version also skipped any individual tok with a preassigned phrasetype,
	// but that is not desirable because it prevents extending its phrase, which is common.
	if(!toks[i].TypeCount())
		continue;

	//-------------------------------------
	// ", AND" ", OR" AND ", BUT" RESET LOCALGOTWHO AND LOCALGOTAVERB SO YOU CAN GET
	// THEM AGAIN IN A NEW DEPENDENT OR INDEPENDENT CLAUSE.  NOTE THAT ANY AND/OR/BUT WITHIN
	// A PHRASE GETS INCORPORATED INTO THE PHRASE, AND WON'T CAUSE RESET HERE.
	if(i > 0)
		if(toks[i-1] == ",")
			if((toks[i] == "and") || (toks[i] == "or") || (toks[i] == "but"))
				localgotwho = localgotaverb = FALSE;

	//-------------------------------------
	// CASES WHERE WE KNOW WHAT KIND OF PHRASE WE PREFER, IF THERE IS AN AMBIGUOUS CHOICE.
	int preferredptype = TOK::UNASSIGNED;		// 0
	if(!localgotwho)
		preferredptype = TOK::NOUNPHR;
	if(localgotwho && !localgotaverb)
		preferredptype = TOK::VERBPHR;

	//-------------------------------------
	// SET UP THE LEGALPHRASES QUERY:
	string typelist = toks[i].TypeList;
	for(int j = 0 ; j < typelist.length() ; j++)
		typelist[j] += 64;							// convert to "ABC" format

	// SQL TO RETRIEVE THE KNOWN-LEGAL PHRASES TO TEST AGAINST.
	// THIS SECTION MODIFIED FROM ANYPHRASE(), BROUGHT OVER HERE TO ALLOW
	// GETTING ANY ITEM IN THE LIST, NOT JUST THE FIRST ROW.
	// How to sort for retrieval seems an important choice.  You don't want the longest,
	// which means the longest ever seen, nor the shortest possible, often too short.
	// Your best first choice is the most common (mode) or average (mean or median) length
	// that is commonly seen for that type of phrase.  Including "most recent" might
	// take into account speech patterns of current user.  Here, as elsewhere,
	// anything you can do to increase probability that first choice is correct can
	// save a lot of re-analysis trouble later, and, maybe even more important,
	// increases probability of being accidentally correct.

	string topic;
	// version 1: chooses longest.  works well, but frequently overreaches
	topic = "WTALK;SQL SELECT DISTINCTROW LegalPhrases.PhraseType, LegalPhrases.Seq "
			"FROM LegalPhrases "
			"WHERE ((LegalPhrases.Seq Like \"[%s]*\") "
			"AND ((Len([LegalPhrases].[Seq]))<=%s)) "
			"ORDER BY Len([LegalPhrases].[Seq]) DESC, LegalPhrases.PhraseType;";

	// version 2: chooses most often used, untested.  may be poor: most often used
	// is usually shortest.  Probably better to use longest, and apply any needed fixes.
// 	topic = "WTALK;SQL SELECT DISTINCTROW LegalPhrases.PhraseType, LegalPhrases.Seq "
// 			"FROM LegalPhrases "
// 			"WHERE ((LegalPhrases.Seq Like \"[%s]*\") "
// 			"AND ((Len([LegalPhrases].[Seq]))<=%s)) "
// 			"ORDER BY LegalPhrases.UseCount DESC , LegalPhrases.LastUsed DESC , "
// 			"LegalPhrases.PhraseType;";

	topic.substring("%s") = typelist;
	topic.substring("%s") = tostring(GetItemsInContainer() - i);

	SDDEConv* chan1 = Dde->DDEInitiate("MSACCESS",topic);
	if(!chan1)                 // IF DDE HAS FAILED, PARSING IS THE LEAST OF YOUR WORRIES
		return(FALSE);

	//-------------------------------------
	// START RETRIEVING AND ANALYZING QUERY LINES

	// retrieved legal phrase sequences in ABC format:
	string fitABC,		// sequence that is legal without changing any word's typenow
		   anyABC,		// sequence that is legal by changing typenows however necessary
		   ABC;			// the sequence finally chosen
	int fitptype, anyptype, ptype, k; 				// same as described above

	fitptype = anyptype = ptype = TOK::UNASSIGNED;
	phraselength = 0;
	while(chan1->DDERequest("NextRow"))
	{
		fromstring(ptype, chan1->Fields[0]); 	// both used as temporaries here,
		ABC = chan1->Fields[1];					// overwritten later

		//--------------------
		// THIS FIRST LOOP DETERMINES WHETHER THIS SEQ IS ELIGIBLE AS FITABC.
		// you only want a new value if: you don't have ANY value yet, OR
		// you do have a value, but it's not the preferred one, and the new value is
		// the preferred one.
		if( (fitptype == TOK::UNASSIGNED) ||
			 ((fitptype != preferredptype) && (ptype == preferredptype)) )
		{
			// abc was already queried such that its length can't extend beyond tokcount
			for(j = 0, k = i ; j < ABC.length() ; j++, k++)
			{
				if(!toks[k].TypeIs(ABC[j] - 64))
					break;
			}
			if(j == ABC.length())		// if it survived the loop, it's eligible
			{
				fitABC = ABC;
				fitptype = ptype;
			}
		}
		//--------------------
		// THIS SECOND LOOP DETERMINES WHETHER THIS SEQ IS ELIGIBLE AS ANYABC.
		if( (anyptype == TOK::UNASSIGNED) ||
			 ((anyptype != preferredptype) && (ptype == preferredptype)) )
		{
			for(j = 0, k = i ; j < ABC.length() ; j++, k++)
			{
				// IF YOU RUN INTO A PRE-MARKED PHRASE, INCORPORATE IT INTO THIS PHRASE,
				// BUT ONLY IF EACH WORD ALREADY HAS THE TYPE YOU NEED (JUST LIKE FITONLY).
				BOOL isok;
				if(toks[k].PhraseType != TOK::UNASSIGNED)
					isok = toks[k].TypeIs(ABC[j] - 64);
				else
					isok = toks[k].HasType(ABC[j] - 64);
				if(!isok)
					break;
			}
			if(j == ABC.length())		// if it survived the loop, it's eligible
			{
				anyABC = ABC;
				anyptype = ptype;
			}
		}
		//--------------------
		// DECIDE WHETHER THE SEQ CHOICES YOU HAVE ARE GOOD ENOUGH, AND YOU CAN QUIT LOOKING.
		if(preferredptype != TOK::UNASSIGNED)
		{
			// if you have preferred value for both.
			// if this is too slow, add more lenient tests:
			// preferred value for either, or ANY value for both, etc.
			if( (fitptype == preferredptype) && (anyptype == preferredptype) )
				break;
		}
		else
		{
			// if nothing is preferred, any value for both is sufficient to quit
			if( (fitptype != TOK::UNASSIGNED) && (anyptype != TOK::UNASSIGNED) )
				break;
		}
		//--------------------
		// MORE TESTS WERE PLANNED FOR SOMEWHERE IN THIS LOOP TO VERIFY OR EXCLUDE A SEQ.
		// methods to remember:
		// you can go back with "PrevRow" DDEitem.
		// you can restart the whole process for or from this or previous (or any) tok using:
		// { Dde->DDETerminate(chan1); i -= n (but compensate for auto-i++); continue; }
		// CONTEXT AND MDB LOOKUP TESTS WOULD GO HERE.

	}								// end 	while(chan1->DDERequest("NextRow"))
	Dde->DDETerminate(chan1);

	//--------------------
	// MAKE FINAL CHOICE WHICH ABC SEQUENCE TO USE
	// if you got no value for either, it will simply do nothing and move on.

	// #error unfinished.  in brief testing, the 2 do often differ, and neither is
	// consistently the correct choice, so they're probably useful.
	// must figure out which is mostly likely correct under which circumstances.

	// "fit" is less likely to overrun end of true phrase, a common problem with VERBPHR
	if(fitptype != TOK::UNASSIGNED)
	{
		ABC = fitABC;
		ptype = fitptype;
	}
	else
	{
		if(anyptype != TOK::UNASSIGNED)
		{
			ABC = anyABC;
			ptype = anyptype;
		}
	}

	//--------------------
	// AT THIS POINT, DECISION IS FINAL TO USE THIS ABC SEQUENCE:
	// MUST MARK THE PHRASE *HERE* EXPLICITLY TO MATCH THE ABC WE WANT.
	// PHRASE() WOULD MAKE ITS OWN NEW DECISION ABOUT WHICH SEQ MATCHES BEST.
	phraselength = ABC.length();
	for(j = 0, k = i ; j < phraselength ; j++, k++)
	{
		toks[k].SetTypenow(ABC[j] - 64);
		toks[k].PhraseType = ptype;
	}
	switch(ptype)		// SPECIAL PROCESSING FOR THE TYPE OF PHRASE YOU JUST CHOSE
	{
		case TOK::NOUNPHR:
		{
			// ASSUME FIRST NOUNPHRASE ENCOUNTERED IS SUBJECT, AND ANY SUBSEQUENT ARE OBJECT
			// IN A DESCRIPTION SENTENCE, DIRECT OBJECT IS ACTUALLY AN EQUIVALENCY
			// 12 EQUIVALENCY: [X] IS A [NOUNPHRASE]
			int objtype = ((sentype == DESCRIPTION) ? TOK::EQUIV : TOK::TOWHOM);
			MarkInfotypes(i,phraselength,(localgotwho ? objtype : TOK::WHO),TRUE);
			gotwho = localgotwho = TRUE;
			break;
		}
		case TOK::VERBPHR:
		{
			MarkInfotypes(i,phraselength,TOK::DIDWHAT,TRUE);
			// APPLY VARIOUS TESTS TO EACH WORD IN THE PHRASE
			for(j = 0, k = i ; (j < phraselength) && (k < tokcount) ; j++, k++)
			{
				// TEST THE WORD FOR BEING AN "ISWORD"
				// BE CAREFUL: YOU CAN'T ALLOW JUST ANY SINGLE ISWORD TO TURN ENTIRE SENTENCE
				// INTO EQUIV
				if(!gotaverb && iswords.HasMember(toks[k]))
				{
					// TEST FOR TWO SPECIAL IS-WORDS ("WAS/WERE [VERB]ING...") AS PAST TENSE
					if( ((toks[k] == "was") || (toks[k] == "were")) && IsIndex(k+1) &&
						((string)(toks[k+1])).endswith("ing") )
							tense = PAST;
					else							// OTHERWISE, ASSUME IS-WORDS INDICATE
						if(sentype != QUESTION)
							sentype = DESCRIPTION;	// A DESCRIPTION OR EQUIVALENCY SENTENCE.

					// TEST FOR "[AM, ETC.] GOING TO..." AS A FUTURE TENSE
					// IF IT'S NOT FOLLOWED BY A NOUN PHRASE
					if( ((toks[k] == "am") || (toks[k] == "are") || (toks[k] == "is")) &&
						IsIndex(k+2) && (toks[k+1] == "going") && (toks[k+2] == "to") &&
						!Phrase(k+2, TOK::PREPPHR, TRUE, FALSE, FALSE) )
							tense = FUTURE;
				}
				string s = toks[k];

				// TEST FOR "...ED" AS THE PAST TENSE OF A VERB
				if(s.endswith("ed") && toks[k].TypeIs(VERB))
					tense = PAST;

				// TEST FOR "WILL..." AS A FUTURE TENSE
				if((s == "will") && IsIndex(k+1) && toks[k+1].TypeIs(VERB))
					tense = FUTURE;
			}
			// if the last tok in phrase is ADV, if previous tok is VERB or ADV,
			// and following might be ADJ, disassociate the ADV from this phrase, so it
			// can join the ADJ, which is more often correct.
			j = i + phraselength - 1;
			if(IsIndex(j+1) && IsIndex(j-1) &&
				(toks[j-1].TypeIs(VERB) || toks[j-1].TypeIs(ADV)) &&
				toks[j].TypeIs(ADV) && toks[j+1].HasType(ADJ))
					phraselength--;  	// forces the ADV to be the next tok processed

			gotaverb = localgotaverb = TRUE;
			break;
		}
		case TOK::PREPPHR: 		// ANY INFOTYPE THESE HAVE IS USUALLY RELIABLE
			// IF ANY WORD IN THE PHRASE HAS AN INFOTYPE, IT OVERRIDES THAT OF THE PREP.
			// THIS ALLOWS, E.G., WHENNOUNS TO CAUSE THEIR ENTIRE PHRASE TO BE MARKED WHEN,
			// EVEN WITH A PREP THAT MAY NORMALLY HAVE A DIFFERENT INFOTYPE.
			for(j = 1, k = i + 1 ; (j < phraselength) && (k < tokcount) ; j++, k++)
				if(toks[k].InfoType != TOK::MISC)
				{
					toks[i].InfoType = toks[k].InfoType;
					break;									// uses the FIRST one found
				}
			MarkInfotypes(i,phraselength,toks[i].InfoType, TRUE);
			break;

		case TOK::ADVPHR:
			// 1. IF THE ADV HAS AN INFOTYPE, COPY TO REST OF PHRASE,
			// 2. ELSE FIND ANY WORD IN PHRASE WITH AN INFOTYPE, AND USE THAT,
			// 3. ELSE ASSUME PHRASE MODIFIES THE NONADJACENT VERB PHRASE.
			if(toks[i].InfoType == TOK::MISC)
			{
				for(j = 1, k = i + 1 ; (j < phraselength) && (k < tokcount) ; j++, k++)
					if(toks[k].InfoType != TOK::MISC)
					{
						toks[i].InfoType = toks[k].InfoType;
						break;									// uses the FIRST one found
					}
				if(toks[i].InfoType == TOK::MISC)
					toks[i].InfoType = TOK::DIDWHAT;
			}
			MarkInfotypes(i,phraselength,toks[i].InfoType,TRUE);
			break;

		case TOK::ADJPHR:
			toks[i].InfoType = TOK::DESCR;		// 13 DESCRIPTION: [X] IS [ADJPHRASE]
			if(sentype != DESCRIPTION)
			{
				// IF ANY WORD LATER IN THE PHRASE HAS AN INFOTYPE, USE IT AS LAST RESORT
				for(j = 1, k = i + 1 ; (j < phraselength) && (k < tokcount) ; j++, k++)
					if(toks[k].InfoType != TOK::MISC)
					{
						toks[i].InfoType = toks[k].InfoType;
						break;									// uses the FIRST one found
					}
			}
			MarkInfotypes(i,phraselength,toks[i].InfoType,TRUE);
			break;

		case TOK::SUBORD:
			break;
		default:
			break;
	}								// end switch
}									// end for(i = 0 ; i < tokcount ; i++)
//-------------------------------------
//-------------------------------------

return(gotwho && gotaverb);
}                        		//DDEParse
//---------------------------------------------------------------------------
//								end class FACT
//////////////////////////////////////////////////////////////////////////////
#endif	// #if (PARSEVERSION == 1)

factmem.cpp

/*	factmem.cpp         	6-13-00
	This is part of the WTalk project.
	Copyright (C)1993-2000 Steven Whitney.
	Published under GNU GPL (General Public License) Version 2, with ABSOLUTELY NO WARRANTY.
	Initially published by http://25yearsofprogramming.com.

Code for the FactMemory class

------
To Do:

------
Notes:

*/
#include "fact.h"
#pragma hdrstop

//////////////////////////////////////////////////////////////////////////////
// class FactMemory
//----------------------------------------------------------------------------
// constructor
FactMemory::FactMemory() : TIArrayAsVector<FACT>(40,0,10)
{
// #error auto-load from disk
}                      			//constructor
//----------------------------------------------------------------------------
// destructor
FactMemory::~FactMemory()
{
// #error auto-save to disk
}                          		//destructor
//----------------------------------------------------------------------------
// extract sentences from a block of source text, and use them to create and add FACTs.
// returns number of FACTs successfully created and added.
uint FactMemory::AddFacts(const string& s)
{
uint count = 0;
uint startpos = 0;
string t;
while((startpos = getsentence(s,startpos,t)) != 0)	// getsentence may add a terminator
{
	FACT* newfact = new FACT(t);
	// maybe also test whether the newfact is partial. if so, append it to previous one?
	// or base decision on whether previous fact was partial?
	if(newfact->IsGood())  				// if tokenization was successful
	{
		if(Add(newfact)) 		       	// add it to the facts database.
			count++;
	}
	else
		delete newfact;
}
return(count);
}                 	       //AddFacts
//---------------------------------------------------------------------------
// list some or all facts database entries to screen.
// returns number listed.
string FactMemory::ShowFacts(BOOL flaggedonly)
{
ostrstream os;
uint count = 0;
int N = GetItemsInContainer();
os << N << " Facts:" << endl;
for(int i = 0 ; i < N ; i++)
{
	if(flaggedonly && !((*this)[i]->flag))
		continue;
	count++;
	os << (*(*this)[i]) << endl;
}
os << N << " facts." << endl;
os << ends;
string s = os.str();
delete[] os.str();
return(s);
}               	         //ShowFacts
//----------------------------------------------------------------------------
// list token info from the last FACT entry
string FactMemory::ShowLast()
{
string s;
int i = GetItemsInContainer();
if(i)
	s = (*this)[i-1]->ShowToks();
return(s);
}               	         //ShowLast
//----------------------------------------------------------------------------
// returns a ptr to the FACT that most closely resembles the provided one, or 0.
const FACT* FactMemory::FindBestMatch(const FACT* f)
{
FACT* best = 0;
if(!f)
	return(best);
double bestscore = 0;
int count = GetItemsInContainer();
for(int i = 0 ; i < count ; i++)
{
	if((*this)[i]->sentype == FACT::QUESTION)	// exclude all questions from the search.
		continue;

	double score = (*this)[i]->Similarity(*f);
	if(score >= bestscore)						// ties go to more recent fact
	{
		best = (*this)[i];
		bestscore = score;
	}
}
return(best);
}              	            //FindBestMatch
//----------------------------------------------------------------------------
// answer the oldest unanswered user question.  Would have preferred that this be
// more modular so the steps could be in the rule action lists, (i.e. mark, then output in a
// separate step), but any mutation of the step order used here would result in nonsense.
// One advantage of the overall question-handling method is that pgm may sometimes
// seem to ignore your Qs, yet later impressively remember that it forgot to
// answer you, and do it.  (sort of a cheap trick, though)
// 
// questions have been parsed into FACTS, which they should be.  determining the clauses
// and their specific who, when, etc. infotypes should greatly help determining
// what the question is asking about, as well as allowing you to search specific
// fields in Facts[], perhaps before resorting to the Similarity measure.
// And it should allow calculating Similarity for temporary FACTs built from
// possibly relevant phrases rather than only to entire FACTs.  That is, if you KNOW
// the question is about where a given subject did a given thing, you can
// calculate Similarity between the SUBJECT of the question with only the subject
// fields in Facts, and between the VERB of the question with only the verb
// fields in Facts.  When you have the best match of both, your answer, if present, is
// in the WHERE-marked phrases of that FACT.
//
// questions are added to Facts like any other user input. They contain no useful
// information, but would LOOK as though they do in searches.
// So questions are excluded from fact searches, and are deleted once they're answered.
// They remain in the LOG.
//
string FactMemory::answerq()
{
ostrstream os;  							// for building the response
uint count = 0;								// # of facts responsive to the question
int factcount = GetItemsInContainer();
for(int i = 0 ; i < factcount ; i++)		// reset all flags
	(*this)[i]->flag = 0;
for(i = 0 ; i < factcount ; i++)			// find oldest question
	if((*this)[i]->sentype == FACT::QUESTION)
	{
		// parse stripped the indicator words out of the fact tokens,
		// but .Sentence is still the same as user entered it (although preprocessed).
		os  << "In response to your question:\n" << (*this)[i]->Sentence << endl
			<< "---------------------" << endl;
// 		const FACT* f = FindBestMatch((*this)[i]);
// 		if(f)
// 		{
			// output all facts that have the same score as the best match found.
			// (tried it: it was too strict, missing many matches)
			// FindBestMatch didn't give us the score (add a new fn that returns it instead?)
// 			double score = f->Similarity(*(*this)[i]);
			for(int j = 0 ; j < factcount ; j++)
				if((*this)[j]->sentype != FACT::QUESTION)			// exclude questions
					// old
// 					if((*this)[i]->Similarity(*(*this)[j]) >= score)
					// new: test for a threshold. score is avg match/token
					if((*this)[i]->Similarity(*(*this)[j]) >= .5) // arbitrary choice
					{
						os << (*(*this)[j]) << endl;
						count++;
					}
// 		}
// 		else
			if(!count)
				os << "I don't know." << endl;
		Destroy(i);								// delete the question
		break;									// quit. we answered 1 oldest question.
	}
os << "I found " << count << " answers." << endl;
os << ends;
string s = os.str();
delete[] os.str();
return(s);
}                     		//answerq
//----------------------------------------------------------------------------
//						end class FactMemory
//////////////////////////////////////////////////////////////////////////////

 

 

Valid HTML 4.01 Transitional Valid CSS
View content labeling at ICRA.
Copyright ©2007 Steven Whitney. Last modified 09/25/2007.