// Filter text
//
// Solution

#include <iostream>

#include <fstream>

#include <string>

using namespace std;

// Reads whitespace-separated tokens from `in`, splits them at punctuation,
// lower-cases them and stores the resulting words in `words`.
// Returns the number of words stored.
int load(std::ifstream& in, std::string words[]);

// Writes the first `count` entries of `words` to `out`, one per line.
void list1(std::string words[], const int count, std::ofstream& out);

// Writes the first `count` words to `out`, one per line, each followed by a
// space and the matching entry of `counts`.
void list2(std::string words[], const int counts[], const int count, std::ofstream& out);

// Sorts the first `count` entries of `words` into ascending order (bubble sort).
void sort(std::string words[], const int count);

// Collapses each run of equal adjacent words into a single entry and stores
// the run lengths in `counts`. Returns the number of entries kept.
int merge(std::string words[], const int count, int counts[]);

int main() {

// ask the user to enter the input file name,

// assuming it ends with .txt, then generate the output file name.

string input, output1, output2;

cout<< “Enter the name of the input file (.txt): “;

cin>> input;

intlen = input.length();

output1 = input.substr(0, len – 4) + “_lower.txt”;

output2 = input.substr(0, len – 4) + “_clc.txt”;

// open the files as input and output streams

ifstream in(input);

ofstream out1(output1);

ofstream out2(output2);

// load word (processed) from input stream to an array

string words[1000];

int count = load(in, words);

// save the count into the output streams

out1 << “text size : ” << count <<endl;

out2 << “text size : ” << count <<endl;

// save words into the first output file.

list1(words, count, out1);

// sort words

sort(words, count);

// merge same words into one, then save the count into the array

int counts[1000];

count = merge(words, count, counts);

// save the result into the second output file

out2 << “vocab size : ” << count <<endl;

list2(words, counts, count, out2);

// close file streams

in.close();

out1.close();

out2.close();

cout<< “Output is saved into file ” << output1 << ” and ” << output2 <<endl;

return 0;

}

// Returns true when `c` is one of the separator characters that are removed
// from words:   ' , . - _ < > ( ) ! ? \ " ; :
// Every other character (letters, digits, '/', '@', whitespace, ...) yields false.
//
// The parameter type is `char`, so a call with a `char` argument is an exact
// match for this function rather than for the C library's ispunct(int).
bool ispunct(char c) {
    switch (c) {
        case '\'':
        case ',':
        case '.':
        case '-':
        case '_':
        case '<':
        case '>':
        case '(':
        case ')':
        case '!':
        case '?':
        case '\\':
        case '"':
        case ';':
        case ':':
            return true;
        default:
            return false;
    }
}

// Reads words from `in` into `words` and returns how many were stored.
//
// The stream is read one whitespace-separated token at a time. Each token is
// cut at every character for which ispunct() is true, upper-case ASCII letters
// are converted to lower case, and every non-empty piece is stored as one word
// (so "it's" yields "it" and "s").
//
// `words` comes without a size, so at most kMaxWords entries are written and
// reading stops once that many words have been stored.
int load(std::ifstream& in, std::string words[]) {
    // Capacity the caller's array is expected to have; main() passes a
    // 1000-element array. Keep the two values in sync.
    const int kMaxWords = 1000;

    int count = 0;
    std::string token;
    std::string word;  // grows as needed, so long words cannot overrun a buffer

    while (count < kMaxWords && in >> token) {
        word.clear();

        for (const char raw : token) {
            if (!ispunct(raw)) {
                char c = raw;
                if (c >= 'A' && c <= 'Z') {  // to lower case
                    c = static_cast<char>(c - 'A' + 'a');
                }
                word += c;
                continue;
            }

            // A separator ends the current word (if any).
            if (!word.empty()) {
                words[count++] = word;
                word.clear();
                if (count == kMaxWords) {
                    break;
                }
            }
        }

        // The token may end without a trailing separator.
        if (!word.empty() && count < kMaxWords) {
            words[count++] = word;
        }
    }

    return count;
}

// Writes the first `count` entries of `words` to `out`, one word per line.
// Nothing is written when `count` is zero or negative.
void list1(std::string words[], const int count, std::ofstream& out) {
    for (int i = 0; i < count; ++i) {
        // '\n' instead of endl: same output without flushing after every word.
        out << words[i] << '\n';
    }
}

// Writes the first `count` words to `out`, one per line, in the form
// "<word> <count>" where the count is the entry of `counts` at the same index.
// Nothing is written when `count` is zero or negative.
void list2(std::string words[], const int counts[], const int count, std::ofstream& out) {
    for (int i = 0; i < count; ++i) {
        // '\n' instead of endl: same output without flushing after every line.
        out << words[i] << " " << counts[i] << '\n';
    }
}

// Sorts the first `count` entries of `words` into ascending order, as defined
// by std::string's operator>, using bubble sort.
// Arrays with fewer than two entries are left untouched.
void sort(std::string words[], const int count) {
    // After each pass the largest remaining word sits at index unsorted - 1,
    // so the next pass can stop one position earlier.
    for (int unsorted = count; unsorted > 1; --unsorted) {
        bool swapped = false;

        for (int j = 0; j + 1 < unsorted; ++j) {
            if (words[j] > words[j + 1]) {
                // Put the smaller word first; swap() exchanges the strings
                // without copying their contents.
                words[j].swap(words[j + 1]);
                swapped = true;
            }
        }

        // A pass without swaps means the array is already in order.
        if (!swapped) {
            break;
        }
    }
}

// Collapses runs of equal adjacent words and counts the length of each run.
//
// words  : the first `count` entries are examined; equal words must be
//          adjacent (e.g. the array is sorted) to be merged. On return the
//          first K entries hold one word per run, in order.
// count  : number of entries to examine.
// counts : on return counts[i] is the run length of words[i], for i < K.
// Returns K, the number of runs (0 when `count` is zero or negative).
int merge(std::string words[], const int count, int counts[]) {
    if (count <= 0) {
        return 0;
    }

    // The first word always starts the first run.
    int kept = 1;
    counts[0] = 1;

    for (int i = 1; i < count; ++i) {
        if (words[i] == words[kept - 1]) {
            // Same word as the current run: just count it.
            ++counts[kept - 1];
            continue;
        }

        // A new word: place it right after the kept entries, overwriting the
        // duplicates that were skipped in between.
        if (kept != i) {
            words[kept] = words[i];
        }
        counts[kept] = 1;
        ++kept;
    }

    return kept;
}