Velocity Reviews - Computer Hardware Reviews

Velocity Reviews > Newsgroups > Programming > C Programming > Remove repeated words from a file

Reply
Thread Tools

Remove repeated words from a file

 
 
arnuld
Guest
Posts: n/a
 
      09-18-2009
/* A C program that reads a file and copies the contents to a new file
while discarding all the repeated words.
* Written by one of my friends, posted by me on CLC for constructive
criticism. I don't think it's a standard
* C program, hence I posted it here to make it one
*
* VERSION 0.0
*
*/


#define __GNU__SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Reads whitespace-separated words from "myfile" and writes each word that
 * has not been seen earlier in the file to "outputfile", separated by single
 * spaces. Along the way it echoes every line, and then every word, of the
 * input to stdout. Always returns 0.
 *
 * NOTE(review): getline() and ssize_t are not part of ISO C; the compiler
 * output posted with this program shows both being rejected under
 * -std=c99 -pedantic.
 */
int main ()
{
/* Scratch buffer for one word; 50 bytes including the terminating NUL. */
char str1[50] = {0};
/* One heap copy per word read so far (duplicates included); 100 slots. */
char *array[100];
FILE *ifp,*ofp;
/* NOTE(review): baseifp is never used. */
FILE *baseifp;


/* i = number of words stored in array, k = scan index, flag = 1 when the
   current word was already seen. */
int i = 0,k=0,flag;
ifp = fopen("myfile", "r");
/* NOTE(review): a failed open is only reported; execution continues and the
   NULL stream is passed to the calls below. */
if(ifp==NULL)
perror("input File is not open");
ofp = fopen("outputfile", "w");
/* NOTE(review): same here - no early exit when the output file cannot be
   opened. */
if(ofp==NULL)
perror("output File is not open");


char * line = NULL;
size_t len = 0;
/* NOTE(review): read is ssize_t (signed) but is printed with %zu below,
   which expects size_t. */
ssize_t read;
/* print read elements on stdout */
while ((read = getline(&line, &len, ifp)) != -1) {
printf("Retrieved line of length %zu :\n", read);
printf("%s", line);
}
/* free(NULL) is a no-op, so the guard is redundant. */
if (line)
free(line);
//fclose (ifp);
/* Reopen the same file so the word pass below starts again from the
   beginning. NOTE(review): the result of freopen is not checked. */
ifp = freopen("myfile", "r", ifp);

/* NOTE(review): "%s" has no field width, so a word longer than 49 characters
   overflows str1. */
while(fscanf(ifp, "%s", str1)!=EOF)
{
printf ("%s\n",str1);
flag = 0;
/* Every word is stored, even one that turns out to be a duplicate.
   NOTE(review): malloc is unchecked, i is never compared against the 100
   slots of array, and the copies are never freed. */
array[i] = (char *)malloc (strlen (str1)+1);
strcpy(array[i],str1);

/* Compare against all previously stored words (indices 0 .. i-1 only, so the
   copy just stored at index i is not matched against itself). */
if(i > 0)
for (k = 0; k < i ; k++)
{
if (strcmp(array[k], str1)==0)
{
flag = 1;
break;
}
}
/* First occurrence: emit the word followed by a single space. */
if (flag == 0)
{
fprintf(ofp, "%s ", str1);
}
i++;
/* Clear the word buffer before the next read. */
memset (str1, 0, 50);
}
printf ("\n");
fclose(ifp);
fclose(ofp);

return 0;
}

================== OUTPUT ========================

[arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
repeated-words.c
remove-repeated-words.c: In function ‘main’:
remove-repeated-words.c:33: error: ‘ssize_t’ undeclared (first use in
this function)
remove-repeated-words.c:33: error: (Each undeclared identifier is
reported only once
remove-repeated-words.c:33: error: for each function it appears in.)
remove-repeated-words.c:33: error: expected ‘;’ before ‘read’
remove-repeated-words.c:35: error: ‘read’ undeclared (first use in this
function)
remove-repeated-words.c:35: warning: implicit declaration of function
‘getline’
remove-repeated-words.c:19: warning: unused variable ‘baseifp’
[arnuld@dune programs]$




Everything is explained in the comments, I have these ideas:

1) First #define __GNU_SOURCE has to go, its not a standard C facility.
2) getline() is not a C function, so I think using fgets() will be a
better idea ?

Will post the code as soon as I rewrite it. Till then can I have your
views ?



--
www.lispmachine.wordpress.com
my email is @ the above blog.

 
Reply With Quote
 
 
 
 
Ben Bacarisse
Guest
Posts: n/a
 
      09-18-2009
arnuld <(E-Mail Removed)> writes:
<snip>
>
> Everything is explained in the comments, I have these ideas:
>
> 1) First #define __GNU_SOURCE has to go, its not a standard C facility.
> 2) getline() is not a C function, so I think using fgets() will be a
> better idea ?


The program doesn't "use" getline. The call is pointless and looks
like a left-over from some previous version.

> Will post the code as soon as I rewrite it. Till then can I have your
> views ?


I think "re-write" is the wrong word. Just start again since almost
nothing is worth preserving. If the author had posted here, I would
be *much* more encouraging since it looks like an honest attempt by a
beginner, but there is not much point in your trying to "improve"
it.

--
Ben.
 
Reply With Quote
 
 
 
 
user923005
Guest
Posts: n/a
 
      09-18-2009
On Sep 18, 12:46 am, arnuld <(E-Mail Removed)> wrote:
> /* A C program that reads a file and copies the contents to a new file
> while discarding all the repeated words.
> ** Written by one of my friends, posted by me on CLC for constructive
> criticism. I dont' think its a standard
> ** C program, hence I posted it here to make it one
> **
> ** VERSION 0.0
> **
> **/
>
> #define __GNU__SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> int main ()
> {
> * * * * char str1[50] = {0};
> * * * * char *array[100];
> * * * * FILE *ifp,*ofp;
> * * * * FILE *baseifp;
>
> * * * * int i = 0,k=0,flag;
> * * * * ifp = fopen("myfile", "r");
> * * * * if(ifp==NULL)
> * * * * * * * * perror("input File is not open");
> * * * * ofp = fopen("outputfile", "w");
> * * * * if(ofp==NULL)
> * * * * * * * * perror("output File is not open");
>
> * * * * char * line = NULL;
> * * * * size_t len = 0;
> * * * * ssize_t read;
> * * * * /* print read elements on stdout */
> * * * * while ((read = getline(&line, &len, ifp)) != -1) {
> * * * * * * * * printf("Retrieved line of length %zu :\n", read);
> * * * * * * * * printf("%s", line);
> * * * * }
> * * * * if (line)
> * * * * * * * * free(line);
> * * * * //fclose (ifp);
> * * * * ifp = freopen("myfile", "r", ifp);
>
> * * * * while(fscanf(ifp, "%s", str1)!=EOF)
> * * * * {
> * * * * * * * * printf ("%s\n",str1);
> * * * * * * * * flag = 0;
> * * * * * * * * array[i] = (char *)malloc (strlen (str1)+1);
> * * * * * * * * strcpy(array[i],str1);
>
> * * * * * * * * if(i > 0)
> * * * * * * * * * * * * for (k = 0; k < i ; k++)
> * * * * * * * * * * * * {
> * * * * * * * * * * * * * * * * if (strcmp(array[k], str1)==0)
> * * * * * * * * * * * * * * * * {
> * * * * * * * * * * * * * * * * * * * * flag = 1;
> * * * * * * * * * * * * * * * * * * * * break;
> * * * * * * * * * * * * * * * * }
> * * * * * * * * * * * * }
> * * * * * * * * if (flag == 0)
> * * * * * * * * {
> * * * * * * * * * * * * fprintf(ofp, "%s ", str1);
> * * * * * * * * }
> * * * * * * * * i++;
> * * * * * * * * memset (str1, 0, 50);
> * * * * }
> * * * * printf ("\n");
> * * * * fclose(ifp);
> * * * * fclose(ofp);
>
> * * * * return 0;
>
> }
>
> ================== OUTPUT ========================
>
> [arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
> repeated-words.c
> remove-repeated-words.c: In function main:
> remove-repeated-words.c:33: error: ssize_t undeclared (first use in
> this function)
> remove-repeated-words.c:33: error: (Each undeclared identifier is
> reported only once
> remove-repeated-words.c:33: error: for each function it appears in.)
> remove-repeated-words.c:33: error: expected ; before read
> remove-repeated-words.c:35: error: read undeclared (first use in this
> function)
> remove-repeated-words.c:35: warning: implicit declaration of function
> getline
> remove-repeated-words.c:19: warning: unused variable baseifp
> [arnuld@dune programs]$
>
> Everything is explained in the comments, I have these ideas:
>
> 1) First #define __GNU_SOURCE has to go, its not a standard C facility.
> 2) getline() is not a C function, so I think using fgets() will be a
> better idea ?
>
> Will post the code as soon as I rewrite it. Till then can I have your
> views ?
>
> --www.lispmachine.wordpress.com
> my email is @ the above blog.


I would rewrite it from scratch, using fgets() and strtok().

The definition is unclear about repeated words.
Does the program need to understand punctuation and capitalization?
Is the goal to actually create a dictionary of unique words?

If it is to be something akin to a spell checker, but having the
function of duplicate word detection, then it is really a very
difficult problem.
And it probably shouldn't always do what is requested. For instance
(from a Monty Python Script):
John: "Oh Marsha, I could make a fool of myself!"
Marsha: "Oh yes, John... Do! Do!"
<John puts on gag glasses with funny nose and moustache attached>

So, my two cents:
1. Making a dictionary of unique words from a file is easy.
2. Removing duplicate words from a file ignoring case and punctuation
is much harder.
3. Actual correction of English text so that the intent is preserved
is an incredibly difficult problem.

In any case, the above attempt accomplishes none of the above and
should be re-written from scratch.
IMO-YMMV.
 
Reply With Quote
 
user923005
Guest
Posts: n/a
 
      09-18-2009
On Sep 18, 2:57 pm, user923005 <(E-Mail Removed)> wrote:
> On Sep 18, 12:46 am, arnuld <(E-Mail Removed)> wrote:
>
>
>
>
>
> > /* A C program that reads a file and copies the contents to a new file
> > while discarding all the repeated words.
> > ** Written by one of my friends, posted by me on CLC for constructive
> > criticism. I dont' think its a standard
> > ** C program, hence I posted it here to make it one
> > **
> > ** VERSION 0.0
> > **
> > **/

>
> > #define __GNU__SOURCE
> > #include <stdio.h>
> > #include <stdlib.h>
> > #include <string.h>
> > int main ()
> > {
> > * * * * char str1[50] = {0};
> > * * * * char *array[100];
> > * * * * FILE *ifp,*ofp;
> > * * * * FILE *baseifp;

>
> > * * * * int i = 0,k=0,flag;
> > * * * * ifp = fopen("myfile", "r");
> > * * * * if(ifp==NULL)
> > * * * * * * * * perror("input File is not open");
> > * * * * ofp = fopen("outputfile", "w");
> > * * * * if(ofp==NULL)
> > * * * * * * * * perror("output File is not open");

>
> > * * * * char * line = NULL;
> > * * * * size_t len = 0;
> > * * * * ssize_t read;
> > * * * * /* print read elements on stdout */
> > * * * * while ((read = getline(&line, &len, ifp)) != -1) {
> > * * * * * * * * printf("Retrieved line of length %zu :\n", read);
> > * * * * * * * * printf("%s", line);
> > * * * * }
> > * * * * if (line)
> > * * * * * * * * free(line);
> > * * * * //fclose (ifp);
> > * * * * ifp = freopen("myfile", "r", ifp);

>
> > * * * * while(fscanf(ifp, "%s", str1)!=EOF)
> > * * * * {
> > * * * * * * * * printf ("%s\n",str1);
> > * * * * * * * * flag = 0;
> > * * * * * * * * array[i] = (char *)malloc (strlen (str1)+1);
> > * * * * * * * * strcpy(array[i],str1);

>
> > * * * * * * * * if(i > 0)
> > * * * * * * * * * * * * for (k = 0; k < i ; k++)
> > * * * * * * * * * * * * {
> > * * * * * * * * * * * * * * * * if (strcmp(array[k], str1)==0)
> > * * * * * * * * * * * * * * * * {
> > * * * * * * * * * * * * * * * * * * * * flag = 1;
> > * * * * * * * * * * * * * * * * * * * * break;
> > * * * * * * * * * * * * * * * * }
> > * * * * * * * * * * * * }
> > * * * * * * * * if (flag == 0)
> > * * * * * * * * {
> > * * * * * * * * * * * * fprintf(ofp, "%s ", str1);
> > * * * * * * * * }
> > * * * * * * * * i++;
> > * * * * * * * * memset (str1, 0, 50);
> > * * * * }
> > * * * * printf ("\n");
> > * * * * fclose(ifp);
> > * * * * fclose(ofp);

>
> > * * * * return 0;

>
> > }

>
> > ================== OUTPUT ========================

>
> > [arnuld@dune programs]$ gcc -std=c99 -pedantic -Wall -Wextra remove-
> > repeated-words.c
> > remove-repeated-words.c: In function main:
> > remove-repeated-words.c:33: error: ssize_t undeclared (first use in
> > this function)
> > remove-repeated-words.c:33: error: (Each undeclared identifier is
> > reported only once
> > remove-repeated-words.c:33: error: for each function it appears in.)
> > remove-repeated-words.c:33: error: expected ; before read
> > remove-repeated-words.c:35: error: read undeclared (first use in this
> > function)
> > remove-repeated-words.c:35: warning: implicit declaration of function
> > getline
> > remove-repeated-words.c:19: warning: unused variable baseifp
> > [arnuld@dune programs]$

>
> > Everything is explained in the comments, I have these ideas:

>
> > 1) First #define __GNU_SOURCE has to go, its not a standard C facility.
> > 2) getline() is not a C function, so I think using fgets() will be a
> > better idea ?

>
> > Will post the code as soon as I rewrite it. Till then can I have your
> > views ?

>
> > --www.lispmachine.wordpress.com
> > my email is @ the above blog.

>
> I would rewrite it from scratch, using fgets() and strtok().
>
> The definition is unclear about repeated words.
> Does the program need to understand punctuation and capitalization?
> Is the goal to actually create a dictionary of unique words?
>
> If it is to be something akin to a spell checker, but having the
> function of duplicate word detection, then it is really a very
> difficult problem.
> And it probably shouldn't always do what is requested. *For instance
> (from a Monty Python Script):
> John: "Oh Marsha, I could make a fool of myself!"
> Marsha: "Oh yes, John... Do! Do!"
> <John puts on gag glasses with funny nose and moustache attached>
>
> So, my two cents:
> 1. *Making a dictionary of unique words from a file is easy.
> 2. *Removing duplicate words from a file ignoring case and punctuation
> is much harder.
> 3. *Actual correction of English text so that the intent is preserved
> is an incredibly difficult problem.
>
> In any case, the above attempt accomplishes none of the above and
> should be re-written from scratch.
> IMO-YMMV.


Maybe something like this:

/*
Purpose:
Primitive program to detect and remove repeated words.
It does not understand hyphenated continuations.
It does not understand capitalization.
It does not understand punctuation.
It does not understand repetition for emphasis.
It's dumb as a box of hammers.

Limits:
It won't work with lines or words bigger than 64K.

Side effects:
It strips out punctuation.
It turns all white space into plain space chars.
It turns all words into lower case words.

Notes:
Use at your own peril.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Line buffer: main() reads each input line into it with fgets() and then
   tokenizes it in place with strtok(). */
static char string[65535];
/* Copy of the most recently seen token. Tokens point into `string`, which is
   overwritten by the next fgets(), so the previous word is kept here. */
static char save_token_string[65535];

/*
 * Normalize a NUL-terminated string in place so that words can be compared
 * with strcmp():
 *   - every punctuation character becomes a plain space,
 *   - every white-space character (tab, newline, ...) becomes a plain space,
 *   - every upper-case letter becomes its lower-case equivalent.
 * All other characters are left untouched. Classification follows the
 * current C locale.
 *
 * s: string to modify; a NULL pointer is accepted and ignored.
 */
void clean_string(char *s)
{
    if (s == NULL)
        return;

    for (; *s != '\0'; s++) {
        /* The <ctype.h> functions are only defined for EOF and values
           representable as unsigned char. Plain char may be signed, so a
           byte >= 0x80 must be converted before it is classified. */
        unsigned char c = (unsigned char) *s;

        if (ispunct(c) || isspace(c))
            *s = ' ';
        else if (isupper(c))
            *s = (char) tolower(c);
    }
}

/*
 * Filter stdin to stdout, dropping every word that is identical to the word
 * immediately before it. Each line is normalized with clean_string() first
 * (punctuation and white space become spaces, letters become lower case),
 * then split on spaces. The previous word is remembered across line
 * boundaries, so a word repeated at the start of the next line is also
 * dropped.
 *
 * Words are written separated by single spaces; if anything was written the
 * output is terminated with a newline.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if reading stdin or writing stdout
 * failed.
 */
int main(void)
{
    const char *previous_token = "";
    char *token = NULL;
    int wrote_word = 0;

    while (fgets(string, sizeof string, stdin) != NULL) {
        clean_string(string);

        for (token = strtok(string, " "); token != NULL;
             token = strtok(NULL, " ")) {
            if (strcmp(token, previous_token) == 0)
                continue; /* immediate repeat: drop it */

            /* Put the separator before every word except the first, so the
               output has no trailing space. */
            if (wrote_word)
                putchar(' ');
            fputs(token, stdout);
            wrote_word = 1;

            /* The token lives inside `string`, which the next fgets() will
               overwrite, so keep a private copy. It always fits: a token is
               never longer than the line buffer, and both buffers have the
               same size. */
            strcpy(save_token_string, token);
            previous_token = save_token_string;
        }
    }

    if (wrote_word)
        putchar('\n');

    if (ferror(stdin) || fflush(stdout) == EOF || ferror(stdout))
        return EXIT_FAILURE;

    return EXIT_SUCCESS;
}
/*
Input file:
C:\tmp>type pitts.dat
Paris in the
the Spring.

Output:
paris in the spring
*/

 
Reply With Quote
 
 
 
Reply

Thread Tools

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

BB code is On
Smilies are On
[IMG] code is On
HTML code is Off
Trackbacks are On
Pingbacks are On
Refbacks are Off


Similar Threads
Thread Thread Starter Forum Replies Last Post
Extracting repeated words candide Python 2 04-02-2011 01:18 PM
Replace stop words (remove words from a string) BerlinBrown Python 6 01-17-2008 02:37 PM
counting repeated words in input arnuld C++ 10 08-03-2007 02:58 PM
Non-noise words are incorrectly recognised as noise words. Peter Striman ASP .Net 1 08-23-2005 01:26 PM
Finding repeated words in text documents: what Algorithm ? Daniele Menozzi Java 9 07-18-2005 06:31 AM



Advertisements