Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
extractor
gesa
Commits
3dcf8dc5
Commit
3dcf8dc5
authored
Jan 09, 2018
by
jkvis
Browse files
gesa library up to date with first version
parent
a8ea503a
Changes
2
Hide whitespace changes
Inline
Side-by-side
include/gesa.h
View file @
3dcf8dc5
...
...
@@ -22,14 +22,14 @@ static int const GESA_DESTROY_STRINGS = 1;
// A suffix array, together with its longest common prefix array and its
// document array, over a collection of n strings. Every member is declared
// exactly once: duplicate member names are not valid C.
typedef struct
{
    size_t n;                   // the number of strings
    GESA_char_t const** string; // an array of n pointers to strings
    size_t* length;             // an array of the lengths of the strings
    size_t total;               // the total length of the suffix array
    GESA_index_t* sa;           // the suffix array
    GESA_index_t* lcp;          // the longest common prefix array
    GESA_index_t* da;           // the document array: which suffix comes
                                // from which string
} GESA; // GESA
...
...
@@ -39,13 +39,38 @@ int GESA_create(GESA* const gesa,
GESA_char_t
const
*
const
string
,
size_t
const
length
);
// All created GESAs should be destroyed. Normally, the GESA does not
// have ownership of the strings. In case it does (e.g., creation via
// deserialization) supply the GESA_DESTROY_STRINGS flag.
void GESA_destroy(GESA* const gesa,
                  int const destroy_strings);


// Merges two GESAs into one
// The result is stored in gesa; its strings are those of gesa_0 followed
// by those of gesa_1. Only the string pointers are copied, so the merged
// GESA refers to the same strings as its inputs.
// On success it returns 0; it returns 1 when a memory allocation fails.
int GESA_merge(GESA* const gesa,
               GESA const* const gesa_0,
               GESA const* const gesa_1);


// WARNING: (de-)serialization is *NOT* portable across architectures

// Serializes a GESA including a copy of the strings
// Written in order: the number of strings, for every string its length
// followed by its characters, and finally the sa, lcp and da arrays.
// Write errors are not reported.
void GESA_serialize(GESA const* const gesa,
                    FILE* stream);


// Deserializes a GESA including newly allocated strings that should be
// freed explicitly, e.g., supply the GESA_DESTROY_STRINGS flag to the
// GESA_destroy function.
// On success it returns 0; it returns 1 when reading from the stream or
// a memory allocation fails.
int GESA_deserialize(GESA* const gesa,
                     FILE* stream);


// Calculates the longest common substring
// Returns the largest lcp value found at a suffix array position whose
// suffix comes from a different string than the suffix directly before
// it. The string number and the start of that suffix are stored in
// *document and *index; both are left untouched when 0 is returned.
GESA_index_t GESA_lcs(GESA_index_t* document,
                      GESA_index_t* index,
                      GESA const* const gesa);


// Prints a GESA for debug purposes
// Nothing is printed when NDEBUG is defined.
void GESA_print(GESA const* const gesa,
                FILE* stream);
#if defined(__cplusplus)
}
// extern "C"
...
...
src/gesa.c
View file @
3dcf8dc5
...
...
@@ -22,7 +22,7 @@ int GESA_create(GESA* const gesa,
gesa
->
string
[
0
]
=
string
;
gesa
->
length
[
0
]
=
length
;
gesa
->
total
=
length
;
gesa
->
sa
=
malloc
((
length
+
1
)
*
sizeof
(
*
gesa
->
sa
));
// sais-lite-lcp specific
gesa
->
sa
=
malloc
((
length
+
1
)
*
sizeof
(
*
gesa
->
sa
));
//
+1 is
sais-lite-lcp specific
gesa
->
lcp
=
malloc
(
length
*
sizeof
(
*
gesa
->
lcp
));
gesa
->
da
=
malloc
(
length
*
sizeof
(
*
gesa
->
da
));
if
(
gesa
->
sa
==
NULL
||
gesa
->
lcp
==
NULL
||
gesa
->
da
==
NULL
)
...
...
@@ -55,3 +55,314 @@ void GESA_destroy(GESA* const gesa,
free
(
gesa
->
lcp
);
free
(
gesa
->
da
);
}
// GESA_destroy
// Compares the suffix at position j[0] of gesa[0] with the suffix at
// position j[1] of gesa[1], starting *q characters into both suffixes.
// *q is advanced for as long as both suffixes continue with the same
// character and neither has reached a '\0'. On return *s_new is 0 when the
// character of the gesa[0] suffix at offset *q is less than or equal to
// the one of the gesa[1] suffix, and 1 otherwise.
static inline void compare(GESA_index_t* const q,
                           size_t* const s_new,
                           GESA const* const gesa[],
                           size_t const j[])
{
    // the strings the two suffixes belong to and where they start
    GESA_char_t const* const lhs = gesa[0]->string[gesa[0]->da[j[0]]];
    GESA_index_t const lhs_start = gesa[0]->sa[j[0]];
    GESA_char_t const* const rhs = gesa[1]->string[gesa[1]->da[j[1]]];
    GESA_index_t const rhs_start = gesa[1]->sa[j[1]];

    for (;;)
    {
        GESA_char_t const a = lhs[lhs_start + *q];
        GESA_char_t const b = rhs[rhs_start + *q];

        if (a == '\0' || b == '\0' || a != b)
        {
            // first position at which the suffixes cannot be extended
            *s_new = (a <= b) ? 0 : 1;
            return;
        } // if

        *q += 1;
    } // for
} // compare
// Exchanges the two values pointed to by a and b.
static inline void swap(size_t* const a,
                        size_t* const b)
{
    size_t const first = *a;
    size_t const second = *b;

    *a = second;
    *b = first;
} // swap
// Merges gesa_0 and gesa_1 into merged.
// The string table of merged holds the string pointers and lengths of
// gesa_0 followed by those of gesa_1; document numbers that originate from
// gesa_1 are therefore shifted by the number of strings in gesa_0.
// Returns 0 on success and 1 when a memory allocation fails.
int GESA_merge(GESA* const merged,
               GESA const* const gesa_0,
               GESA const* const gesa_1)
{
    GESA const* const gesa[2] = {gesa_0, gesa_1};

    // string table
    merged->n = gesa_0->n + gesa_1->n;
    merged->string = malloc(merged->n * sizeof(*merged->string));
    merged->length = malloc(merged->n * sizeof(*merged->length));
    if (merged->string == NULL || merged->length == NULL)
    {
        free(merged->string);
        free(merged->length);
        return 1; // memory allocation failed
    } // if

    size_t offset = 0;
    for (size_t which = 0; which < 2; ++which)
    {
        for (size_t k = 0; k < gesa[which]->n; ++k)
        {
            merged->string[offset + k] = gesa[which]->string[k];
            merged->length[offset + k] = gesa[which]->length[k];
        } // for
        offset += gesa[which]->n;
    } // for

    // suffix, lcp and document arrays
    merged->total = gesa_0->total + gesa_1->total;
    merged->sa = malloc(merged->total * sizeof(*merged->sa));
    merged->lcp = malloc(merged->total * sizeof(*merged->lcp));
    merged->da = malloc(merged->total * sizeof(*merged->da));
    if (merged->sa == NULL || merged->lcp == NULL || merged->da == NULL)
    {
        GESA_destroy(merged, GESA_KEEP_STRINGS);
        return 1; // memory allocation failed
    } // if

    // the first n entries of both inputs are copied one after the other
    // with an lcp of 0
    for (size_t k = 0; k < gesa_0->n; ++k)
    {
        merged->sa[k] = gesa_0->sa[k];
        merged->lcp[k] = 0;
        merged->da[k] = gesa_0->da[k];
    } // for
    for (size_t k = 0; k < gesa_1->n; ++k)
    {
        size_t const to = k + gesa_0->n;

        merged->sa[to] = gesa_1->sa[k];
        merged->lcp[to] = 0;
        merged->da[to] = gesa_1->da[k] + (GESA_index_t) gesa_0->n;
    } // for

    // The remaining entries are interleaved: `active` is the input that
    // entries are currently taken from, `idle` is the other one, pos[] are
    // the read positions in both inputs and `depth` is the match length
    // maintained by compare().
    size_t out = merged->n;
    size_t pos[2] = {gesa_0->n, gesa_1->n};
    GESA_index_t depth = 0;
    size_t active = 1;
    size_t idle = 0;

    while (out < merged->total)
    {
        GESA_index_t const previous = depth;
        size_t winner;

        if (pos[0] >= gesa_0->total || pos[1] >= gesa_1->total)
        {
            // one of the inputs is used up
            depth = -1;
            winner = idle;
        } // if
        else
        {
            compare(&depth, &winner, gesa, pos);
        } // else

        if (winner != active)
        {
            merged->lcp[out] = previous;
            swap(&idle, &active);
        } // if
        else
        {
            merged->lcp[out] = gesa[active]->lcp[pos[active]];
        } // else

        GESA_index_t document = gesa[active]->da[pos[active]];
        if (active > 0)
        {
            document += (GESA_index_t) gesa_0->n;
        } // if
        merged->da[out] = document;
        merged->sa[out] = gesa[active]->sa[pos[active]];

        out += 1;
        pos[active] += 1;

        while (pos[active] < gesa[active]->total &&
               gesa[active]->lcp[pos[active]] != depth)
        {
            GESA_index_t const next = gesa[active]->lcp[pos[active]];

            if (next > depth)
            {
                merged->lcp[out] = next;
            } // if
            else
            {
                merged->lcp[out] = depth;
                depth = next;
                swap(&idle, &active);
            } // else

            GESA_index_t entry = gesa[active]->da[pos[active]];
            if (active > 0)
            {
                entry += (GESA_index_t) gesa_0->n;
            } // if
            merged->da[out] = entry;
            merged->sa[out] = gesa[active]->sa[pos[active]];

            out += 1;
            pos[active] += 1;
        } // while
    } // while

    return 0;
} // GESA_merge
// Writes a GESA to stream: the number of strings, then for every string
// its length followed by its characters, and finally the sa, lcp and da
// arrays (total entries each).
// fwrite return values are ignored
void GESA_serialize(GESA const* const gesa,
                    FILE* stream)
{
    fwrite(&gesa->n, sizeof(gesa->n), 1, stream);

    for (size_t k = 0; k < gesa->n; ++k)
    {
        size_t const* const count = gesa->length + k;

        fwrite(count, sizeof(*count), 1, stream);
        fwrite(gesa->string[k], sizeof(gesa->string[k][0]), *count, stream);
    } // for

    GESA_index_t const* const table[3] = {gesa->sa, gesa->lcp, gesa->da};
    for (size_t t = 0; t < 3; ++t)
    {
        fwrite(table[t], sizeof(*table[t]), gesa->total, stream);
    } // for
} // GESA_serialize
// Releases the first count strings of a partially deserialized GESA
// together with its string and length arrays.
static void discard_strings(GESA* const gesa,
                            size_t const count)
{
    for (size_t i = 0; i < count; ++i)
    {
        free((void*) gesa->string[i]);
    } // for
    free(gesa->string);
    free(gesa->length);
} // discard_strings


// Reads a GESA from stream in the layout written by GESA_serialize: the
// number of strings, for every string its length followed by its
// characters, and finally the sa, lcp and da arrays.
// All strings are newly allocated; total is the sum of the string lengths.
// Returns 0 on success and 1 when reading or a memory allocation fails, in
// which case everything allocated so far has been released again.
int GESA_deserialize(GESA* const gesa,
                     FILE* stream)
{
    if (fread(&gesa->n, sizeof(gesa->n), 1, stream) != 1)
    {
        return 1; // fread failed
    } // if

    // n comes from the stream: calloc rejects an element count whose byte
    // size would overflow instead of silently allocating too little
    gesa->string = calloc(gesa->n, sizeof(*gesa->string));
    gesa->length = calloc(gesa->n, sizeof(*gesa->length));
    if (gesa->string == NULL || gesa->length == NULL)
    {
        discard_strings(gesa, 0);
        return 1; // memory allocation failed
    } // if

    gesa->total = 0;
    for (size_t i = 0; i < gesa->n; ++i)
    {
        if (fread(&gesa->length[i], sizeof(gesa->length[i]), 1, stream) != 1)
        {
            // the strings before i are already allocated: free them too
            discard_strings(gesa, i);
            return 1; // fread failed
        } // if

        gesa->string[i] = malloc(gesa->length[i] * sizeof(*gesa->string[i]));
        if (gesa->string[i] == NULL)
        {
            discard_strings(gesa, i);
            return 1; // memory allocation failed
        } // if

        if (fread((void*) gesa->string[i],
                  sizeof(*gesa->string[i]),
                  gesa->length[i],
                  stream) != gesa->length[i])
        {
            discard_strings(gesa, i + 1);
            return 1; // fread failed
        } // if

        gesa->total += gesa->length[i];
    } // for

    gesa->sa = malloc(gesa->total * sizeof(*gesa->sa));
    gesa->lcp = malloc(gesa->total * sizeof(*gesa->lcp));
    gesa->da = malloc(gesa->total * sizeof(*gesa->da));
    if (gesa->sa == NULL || gesa->lcp == NULL || gesa->da == NULL)
    {
        GESA_destroy(gesa, GESA_DESTROY_STRINGS);
        return 1; // memory allocation failed
    } // if

    if (fread(gesa->sa, sizeof(*gesa->sa), gesa->total, stream) != gesa->total ||
        fread(gesa->lcp, sizeof(*gesa->lcp), gesa->total, stream) != gesa->total ||
        fread(gesa->da, sizeof(*gesa->da), gesa->total, stream) != gesa->total)
    {
        GESA_destroy(gesa, GESA_DESTROY_STRINGS);
        return 1; // fread failed
    } // if

    return 0;
} // GESA_deserialize
// Scans the suffix array for the largest lcp value at a position whose
// suffix belongs to a different string than the suffix directly before it.
// That value is returned; the string number and the start of the suffix at
// that position are stored in *document and *index. When 0 is returned
// neither *document nor *index has been written.
GESA_index_t GESA_lcs(GESA_index_t* document,
                      GESA_index_t* index,
                      GESA const* const gesa)
{
    GESA_index_t best = 0;

    for (size_t k = 1; k < gesa->total; ++k)
    {
        // neighbours from the same string or no improvement: skip
        if (gesa->da[k - 1] == gesa->da[k] || gesa->lcp[k] <= best)
        {
            continue;
        } // if

        best = gesa->lcp[k];
        *document = gesa->da[k];
        *index = gesa->sa[k];
    } // for

    return best;
} // GESA_lcs
#if !defined(NDEBUG)
// Writes the characters string[start] .. string[end - 1] to stream. When
// that range is at least length + 2 characters long (and length is at
// least 4) only its beginning and its ending are written, separated by
// "..".
// Returns the number of characters written.
static size_t print_truncated(GESA_char_t const* const string,
                              size_t const start,
                              size_t const end,
                              size_t const length,
                              FILE* stream)
{
    if (end - start < length + 2 || length < 4)
    {
        return fwrite(string + start, sizeof(*string), end - start, stream);
    } // if

    // The three writes are separate statements on purpose: as operands of
    // one `+` expression their order of evaluation is unspecified, so the
    // pieces could reach the stream in any order.
    size_t const part = length / 2 - 1;

    size_t written = fwrite(string + start, sizeof(*string), part, stream);
    if (fputs("..", stream) != EOF)
    {
        written += 2;
    } // if
    // NOTE(review): the tail starts at end - length / 2 but only
    // length / 2 - 1 characters are written, so the character directly
    // before `end` is never printed -- presumably intended; confirm.
    written += fwrite(string + end - (length / 2), sizeof(*string), part, stream);

    return written;
} // print_truncated
#endif
// Prints a GESA to stream: first a table of all strings with their
// lengths, then the total, then one line per suffix array entry with its
// da, sa and lcp values and the (truncated) suffix itself.
// Nothing is printed when NDEBUG is defined.
// fprintf return values are ignored as this is for debugging only
void GESA_print(GESA const* const gesa,
                FILE* stream)
{
    // silences unused parameter warnings when NDEBUG is defined
    (void) gesa;
    (void) stream;

#if !defined(NDEBUG)
    // size_t values are printed with %zu: %ld only matches when size_t
    // happens to have the representation of long
    fprintf(stream, "GESA for %zu string(s):\n    i length string\n", gesa->n);
    for (size_t i = 0; i < gesa->n; ++i)
    {
        fprintf(stream, "%5zu %5zu ", i, gesa->length[i]);
        print_truncated(gesa->string[i], 0, gesa->length[i], 40, stream);
        fprintf(stream, "\n");
    } // for
    fprintf(stream, "total: %5zu\n\n", gesa->total);

    fprintf(stream, "    i    da    sa   lcp suffix\n");
    for (size_t i = 0; i < gesa->total; ++i)
    {
        // NOTE(review): %5d assumes GESA_index_t is passed as an int --
        // confirm against its typedef
        fprintf(stream, "%5zu %5d %5d %5d ", i, gesa->da[i], gesa->sa[i], gesa->lcp[i]);
        print_truncated(gesa->string[gesa->da[i]],
                        gesa->sa[i],
                        gesa->length[gesa->da[i]],
                        40,
                        stream);
        fprintf(stream, "\n");
    } // for
    fprintf(stream, "\n");
#endif
} // GESA_print
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment