Thursday, June 18, 2009

Concordance implementation in C#3.0

using System;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace Concordance
{
class Program
{
    /// <summary>
    /// Characters that terminate a word. Besides punctuation and the space
    /// character this includes tab, carriage return and line feed so that words
    /// separated only by a line break are not fused into a single token.
    /// </summary>
    static readonly char[] WordSeparators =
        { '.', '?', '!', ' ', ';', ':', ',', '\t', '\r', '\n' };

    /// <summary>
    /// Matches the whitespace between two sentences of lower-cased text: it must
    /// follow a letter, digit or quote plus one of . ! ? and precede a letter.
    /// </summary>
    static readonly Regex SentenceBoundary =
        new Regex(@"(?<=['""a-z0-9][\.\!\?])\s+(?=[a-z])");

    /// <summary>Number of letters available for index labels (a..z).</summary>
    const int LettersInAlphabet = 'z' - 'a' + 1;

    /// <summary>Width of the left-aligned index label column.</summary>
    const int LabelColumnWidth = 5;

    /// <summary>Width of the left-aligned word column.</summary>
    const int WordColumnWidth = 15;

    /// <summary>
    /// Prints the concordance of a built-in sample text to the console and then
    /// waits for a key press.
    /// </summary>
    /// <remarks>
    /// Known limitation: abbreviations such as "i.e." are not recognised. Their
    /// letters are counted as separate words ("i" and "e"), and the period that
    /// ends the abbreviation is treated as the end of a sentence.
    /// </remarks>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main(string[] args)
    {
        string strString =
            "Given an arbitrary text document written in English, write a program that will generate a concordance, i.e. an alphabetical list of all word occurrences, labeled with word frequencies. Bonus: label each word with the sentence numbers in which each occurrence appeared.";

        foreach (string line in BuildConcordanceLines(strString))
        {
            Console.WriteLine(line);
        }

        Console.WriteLine("Press any key to exit.");
        Console.ReadKey();
    }

    /// <summary>
    /// Builds the concordance of <paramref name="text"/>: one formatted line per
    /// distinct word, in alphabetical order.
    /// </summary>
    /// <remarks>
    /// Each line has the form <c>label word {count:s1,s2,...}</c> where
    /// <c>label</c> is produced by <c>GetIndexString</c> (a. .. z. aa. ..),
    /// <c>count</c> is the number of occurrences of the word and <c>s1,s2,...</c>
    /// are the 1-based sentence numbers, one entry per occurrence.
    /// </remarks>
    /// <param name="text">The text to analyse. Must not be null.</param>
    /// <returns>The formatted concordance lines; empty when the text has no words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    internal static List<string> BuildConcordanceLines(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        // Invariant lower-casing: a culture-sensitive ToLower() would, for example,
        // turn "I" into a dotless i under Turkish culture, which the [a-z] classes
        // of the sentence pattern no longer match.
        string[] sentences = SentenceBoundary.Split(text.ToLowerInvariant());

        // Single pass over the text: remember, for every word, the sentence number
        // of each of its occurrences (in reading order).
        var sentenceNumbersByWord = new Dictionary<string, List<int>>();
        var distinctWords = new List<string>();
        for (int index = 0; index < sentences.Length; index++)
        {
            string[] words = sentences[index].Split(
                WordSeparators, StringSplitOptions.RemoveEmptyEntries);

            foreach (string word in words)
            {
                List<int> sentenceNumbers;
                if (!sentenceNumbersByWord.TryGetValue(word, out sentenceNumbers))
                {
                    sentenceNumbers = new List<int>();
                    sentenceNumbersByWord.Add(word, sentenceNumbers);
                    distinctWords.Add(word);
                }

                sentenceNumbers.Add(index + 1);
            }
        }

        var lines = new List<string>(distinctWords.Count);
        int position = 0;

        // OrderBy uses the default string comparer, as the original query did.
        foreach (string word in distinctWords.OrderBy(w => w))
        {
            List<int> sentenceNumbers = sentenceNumbersByWord[word];

            // After "z." the labels start over with doubled letters ("aa."), then
            // tripled ones, and so on.
            string label = GetIndexString(
                (char)('a' + position % LettersInAlphabet),
                position / LettersInAlphabet + 1);

            // string.Join(string, string[]) is used because it exists on every
            // framework version this file can target.
            string sentenceList = string.Join(
                ",", sentenceNumbers.Select(n => n.ToString()).ToArray());

            lines.Add(
                label.PadRight(LabelColumnWidth) + " " +
                word.PadRight(WordColumnWidth) +
                "{" + sentenceNumbers.Count.ToString() + ":" + sentenceList + "}");

            position++;
        }

        return lines;
    }

    /// <summary>
    /// Returns an index label made of <paramref name="charachterToPrint"/> repeated
    /// <paramref name="charachterCount"/> times and followed by a period,
    /// e.g. "a.", "b.", ... "aa.", "bb.".
    /// </summary>
    /// <param name="charachterToPrint">The character to repeat.</param>
    /// <param name="charachterCount">How many times the character is repeated.</param>
    /// <returns>
    /// The label, or an empty string when <paramref name="charachterCount"/> is
    /// zero or negative.
    /// </returns>
    static string GetIndexString(char charachterToPrint, int charachterCount)
    {
        if (charachterCount <= 0)
        {
            return string.Empty;
        }

        return new string(charachterToPrint, charachterCount) + ".";
    }
}
}

1 comment:

Anonymous said...

Do you have a version of the program that can count even "i.e."?

Search This Blog