Commit 3dcf8dc5 authored by jkvis

gesa library up to date with first version

parent a8ea503a
......@@ -22,14 +22,14 @@ static int const GESA_DESTROY_STRINGS = 1;
// Suffix-array index over a set of strings. The sa, lcp and da arrays are
// indexed by suffix rank; string and length are indexed by string number.
// Each member is declared exactly once (a duplicated member list does not
// compile).
typedef struct
{
    size_t n;                   // the number of strings
    GESA_char_t const** string; // an array of n pointers to strings
    size_t* length;             // an array of the lengths of the strings
    size_t total;               // the total length of the suffix array
    GESA_index_t* sa;           // the suffix array
    GESA_index_t* lcp;          // the longest common prefix array
    GESA_index_t* da;           // the document array: which suffix comes
                                // from which string
} GESA; // GESA
......@@ -39,13 +39,38 @@ int GESA_create(GESA* const gesa,
GESA_char_t const* const string,
size_t const length);
// Releases the memory held by a GESA. Every created GESA should be
// destroyed. Normally, the GESA does not have ownership of the strings.
// In case it does (e.g., creation via deserialization) supply the
// GESA_DESTROY_STRINGS flag as destroy_strings so the strings are
// released as well.
void GESA_destroy(GESA* const gesa,
int const destroy_strings);
// Merges two GESAs (gesa_0 and gesa_1) into one (gesa).
// The merged GESA refers to the same strings as the inputs; the strings
// of gesa_0 are numbered first, followed by those of gesa_1.
// On success it returns 0; it returns 1 when a memory allocation fails.
int GESA_merge(GESA* const gesa,
GESA const* const gesa_0,
GESA const* const gesa_1);
// WARNING: (de-)serialization is *NOT* portable across architectures:
// the fields are written as raw in-memory bytes.
// Serializes a GESA, including a copy of the strings, to stream.
// Write errors are not reported.
void GESA_serialize(GESA const* const gesa,
FILE* stream);
// Deserializes a GESA from stream, including newly allocated strings that
// should be freed explicitly, e.g., supply the GESA_DESTROY_STRINGS flag
// to the GESA_destroy function.
// On success it returns 0; it returns 1 when reading from the stream or
// allocating memory fails.
int GESA_deserialize(GESA* const gesa,
FILE* stream);
// Calculates the longest common substring: returns the largest lcp value
// found between two adjacent suffixes that come from different strings.
// When the result is greater than 0, *document and *index receive the
// string number and the offset of one occurrence; when the result is 0
// they are left untouched.
GESA_index_t GESA_lcs(GESA_index_t* document,
GESA_index_t* index,
GESA const* const gesa);
// Prints a GESA to stream for debug purposes: first the list of strings,
// then one line per suffix. Nothing is printed when NDEBUG is defined.
void GESA_print(GESA const* const gesa,
FILE* stream);
#if defined(__cplusplus)
} // extern "C"
......
......@@ -22,7 +22,7 @@ int GESA_create(GESA* const gesa,
gesa->string[0] = string;
gesa->length[0] = length;
gesa->total = length;
gesa->sa = malloc((length + 1) * sizeof(*gesa->sa)); // sais-lite-lcp specific
gesa->sa = malloc((length + 1) * sizeof(*gesa->sa)); // +1 is sais-lite-lcp specific
gesa->lcp = malloc(length * sizeof(*gesa->lcp));
gesa->da = malloc(length * sizeof(*gesa->da));
if (gesa->sa == NULL || gesa->lcp == NULL || gesa->da == NULL)
......@@ -55,3 +55,314 @@ void GESA_destroy(GESA* const gesa,
free(gesa->lcp);
free(gesa->da);
} // GESA_destroy
// Compares the suffix at rank j[0] of gesa[0] with the suffix at rank j[1]
// of gesa[1], starting at offset *q. *q is advanced past every position
// where both suffixes continue and agree. Afterwards *s_new is set to 0
// when the character of gesa[0] at offset *q is less than or equal to the
// character of gesa[1] at that offset, and to 1 otherwise.
static inline void compare(GESA_index_t* const q,
                           size_t* const s_new,
                           GESA const* const gesa[],
                           size_t const j[])
{
    GESA const* const left = gesa[0];
    GESA const* const right = gesa[1];

    // which string each suffix belongs to and where it starts
    GESA_index_t const left_doc = left->da[j[0]];
    GESA_index_t const left_pos = left->sa[j[0]];
    GESA_index_t const right_doc = right->da[j[1]];
    GESA_index_t const right_pos = right->sa[j[1]];

    // A non-terminator character on the left that equals the character on
    // the right implies the right one is not a terminator either.
    while (left->string[left_doc][left_pos + *q] != '\0' &&
           left->string[left_doc][left_pos + *q] ==
           right->string[right_doc][right_pos + *q])
    {
        *q += 1;
    } // while

    // ties are resolved in favour of gesa[0]
    *s_new = (left->string[left_doc][left_pos + *q] <=
              right->string[right_doc][right_pos + *q]) ? 0 : 1;
} // compare
// Exchanges the values stored at a and b. Passing the same address for
// both leaves the value unchanged.
static inline void swap(size_t* const a, size_t* const b)
{
    size_t const first = *a;
    size_t const second = *b;

    *a = second;
    *b = first;
} // swap
// Merges gesa_0 and gesa_1 into merged.
//
// The string pointers and lengths of both inputs are copied into merged
// (the string contents are not copied), gesa_0 first and gesa_1 after it.
// Document numbers that originate from gesa_1 are shifted by gesa_0->n so
// that they index the combined string array.
//
// Returns 0 on success and 1 when a memory allocation fails.
int GESA_merge(GESA* const merged,
GESA const* const gesa_0,
GESA const* const gesa_1)
{
// both inputs in an array so that a side can be selected by index (0 or 1)
GESA const* const gesa[2] =
{
gesa_0,
gesa_1
}; // gesa
merged->n = gesa[0]->n + gesa[1]->n;
merged->string = malloc(merged->n * sizeof(*merged->string));
merged->length = malloc(merged->n * sizeof(*merged->length));
if (merged->string == NULL || merged->length == NULL)
{
// free(NULL) is a no-op, so whichever allocation succeeded is released
// NOTE(review): merged->string and merged->length keep their freed
// values after this return
free(merged->string);
free(merged->length);
return 1; // memory allocation failed
} // if
// strings of gesa_0 occupy slots [0, gesa_0->n)
for (size_t i = 0; i < gesa[0]->n; ++i)
{
merged->string[i] = gesa[0]->string[i];
merged->length[i] = gesa[0]->length[i];
} // for
// strings of gesa_1 follow at slots [gesa_0->n, merged->n)
for (size_t i = 0; i < gesa[1]->n; ++i)
{
merged->string[i + gesa[0]->n] = gesa[1]->string[i];
merged->length[i + gesa[0]->n] = gesa[1]->length[i];
} // for
merged->total = gesa[0]->total + gesa[1]->total;
merged->sa = malloc(merged->total * sizeof(*merged->sa));
merged->lcp = malloc(merged->total * sizeof(*merged->lcp));
merged->da = malloc(merged->total * sizeof(*merged->da));
if (merged->sa == NULL || merged->lcp == NULL || merged->da == NULL)
{
// the strings belong to the inputs, so they are kept
GESA_destroy(merged, GESA_KEEP_STRINGS);
return 1; // memory allocation failed
} // if
// The first n entries of each input are copied unmerged with an lcp of 0.
// NOTE(review): presumably these are the suffixes that start at each
// string's terminator -- confirm against GESA_create
for (size_t i = 0; i < gesa[0]->n; ++i)
{
merged->sa[i] = gesa[0]->sa[i];
merged->lcp[i] = 0;
merged->da[i] = gesa[0]->da[i];
} // for
for (size_t i = 0; i < gesa[1]->n; ++i)
{
merged->sa [i + gesa[0]->n] = gesa[1]->sa[i];
merged->lcp[i + gesa[0]->n] = 0;
merged->da [i + gesa[0]->n] = gesa[1]->da[i] + (GESA_index_t) gesa[0]->n;
} // for
// i: next free slot in the merged arrays
// j[k]: next unread entry of gesa[k]
// q: match length carried between comparisons
// s: the side the next entry is taken from; s_bar: the other side
size_t i = merged->n;
size_t j[2] = {gesa[0]->n, gesa[1]->n};
GESA_index_t q = 0;
size_t s = 1;
size_t s_bar = 0;
while (i < merged->total)
{
GESA_index_t const q_old = q;
size_t s_new;
if (j[0] < gesa[0]->total && j[1] < gesa[1]->total)
{
// both sides have entries left: compare the two candidate suffixes
compare(&q, &s_new, gesa, j);
} // if
else
{
// one side is exhausted: no comparison is possible
// NOTE(review): q = -1 assumes GESA_index_t is signed -- confirm.
// This path always switches sides below; verify that the side switched
// to is the one with entries remaining, as j[s] is not bounds-checked
// before the reads that follow.
q = -1;
s_new = s_bar;
} // else
if (s_new == s)
{
// staying on the same side: its own lcp value carries over
merged->lcp[i] = gesa[s]->lcp[j[s]];
} // if
else
{
// switching sides: the lcp is the match length from before this
// comparison
merged->lcp[i] = q_old;
swap(&s_bar, &s);
} // else
merged->da[i] = gesa[s]->da[j[s]];
if (s > 0)
{
// entries from gesa_1 refer to strings stored after those of gesa_0
merged->da[i] += (GESA_index_t) gesa[0]->n;
} // if
merged->sa[i] = gesa[s]->sa[j[s]];
i += 1;
j[s] += 1;
// Emit further entries without a character comparison for as long as
// the lcp value of the current side differs from q.
while (j[s] < gesa[s]->total && gesa[s]->lcp[j[s]] != q)
{
if (gesa[s]->lcp[j[s]] > q)
{
// same side continues; its lcp value is copied unchanged
merged->lcp[i] = gesa[s]->lcp[j[s]];
} // if
else
{
// lcp below q: record q, continue with the smaller value and switch
// sides
merged->lcp[i] = q;
q = gesa[s]->lcp[j[s]];
swap(&s_bar, &s);
} // else
// s may have been swapped above; the entry is taken from the current s
merged->da[i] = gesa[s]->da[j[s]];
if (s > 0)
{
merged->da[i] += (GESA_index_t) gesa[0]->n;
} // if
merged->sa[i] = gesa[s]->sa[j[s]];
i += 1;
j[s] += 1;
} // while
} // while
return 0;
} // GESA_merge
// Writes a GESA to stream as raw bytes, in this order: the number of
// strings; for every string its length followed by its characters; then
// the sa, lcp and da arrays (total entries each).
// fwrite return values are ignored
void GESA_serialize(GESA const* const gesa,
                    FILE* stream)
{
    size_t const count = gesa->n;
    size_t const entries = gesa->total;

    fwrite(&count, sizeof(count), 1, stream);

    size_t k = 0;
    while (k < count)
    {
        size_t const len = gesa->length[k];

        fwrite(&len, sizeof(len), 1, stream);
        fwrite(gesa->string[k], sizeof(*gesa->string[k]), len, stream);
        k += 1;
    } // while

    fwrite(gesa->sa, sizeof(*gesa->sa), entries, stream);
    fwrite(gesa->lcp, sizeof(*gesa->lcp), entries, stream);
    fwrite(gesa->da, sizeof(*gesa->da), entries, stream);
} // GESA_serialize
// Releases the first count strings of a partially deserialized GESA
// together with its string and length arrays.
static void deserialize_discard(GESA* const gesa,
                                size_t const count)
{
    for (size_t i = 0; i < count; ++i)
    {
        free((void*) gesa->string[i]);
    } // for
    free(gesa->string);
    free(gesa->length);
} // deserialize_discard


// Reads a GESA from stream in the layout produced by GESA_serialize: the
// number of strings; for every string its length followed by its
// characters; then the sa, lcp and da arrays.
//
// The strings are newly allocated and owned by the GESA: destroy it with
// the GESA_DESTROY_STRINGS flag.
//
// Returns 0 on success. Returns 1 when a read comes up short, a memory
// allocation fails, or a size read from the stream is too large to
// allocate; in that case everything allocated so far has been released.
int GESA_deserialize(GESA* const gesa,
                     FILE* stream)
{
    size_t const size_max = (size_t) -1;

    if (fread(&gesa->n, sizeof(gesa->n), 1, stream) != 1)
    {
        return 1; // fread failed
    } // if

    // n comes from the stream: reject counts whose array size would wrap
    if (gesa->n > size_max / sizeof(*gesa->string) ||
        gesa->n > size_max / sizeof(*gesa->length))
    {
        return 1; // size overflow
    } // if

    gesa->string = malloc(gesa->n * sizeof(*gesa->string));
    gesa->length = malloc(gesa->n * sizeof(*gesa->length));
    if (gesa->string == NULL || gesa->length == NULL)
    {
        // free(NULL) is a no-op, so whichever allocation succeeded is released
        free(gesa->string);
        free(gesa->length);
        return 1; // memory allocation failed
    } // if

    gesa->total = 0;
    for (size_t i = 0; i < gesa->n; ++i)
    {
        if (fread(&gesa->length[i], sizeof(gesa->length[i]), 1, stream) != 1)
        {
            // strings [0, i) were read in earlier iterations and must be
            // released as well
            deserialize_discard(gesa, i);
            return 1; // fread failed
        } // if

        if (gesa->length[i] > size_max / sizeof(*gesa->string[i]))
        {
            deserialize_discard(gesa, i);
            return 1; // size overflow
        } // if

        gesa->string[i] = malloc(gesa->length[i] * sizeof(*gesa->string[i]));
        if (gesa->string[i] == NULL)
        {
            deserialize_discard(gesa, i);
            return 1; // memory allocation failed
        } // if

        if (fread((void*) gesa->string[i], sizeof(*gesa->string[i]), gesa->length[i], stream) != gesa->length[i])
        {
            // string i itself is allocated at this point
            deserialize_discard(gesa, i + 1);
            return 1; // fread failed
        } // if

        gesa->total += gesa->length[i];
    } // for

    // sa, lcp and da share one element type, so one check covers all three
    if (gesa->total > size_max / sizeof(*gesa->sa))
    {
        deserialize_discard(gesa, gesa->n);
        return 1; // size overflow
    } // if

    gesa->sa = malloc(gesa->total * sizeof(*gesa->sa));
    gesa->lcp = malloc(gesa->total * sizeof(*gesa->lcp));
    gesa->da = malloc(gesa->total * sizeof(*gesa->da));
    if (gesa->sa == NULL || gesa->lcp == NULL || gesa->da == NULL)
    {
        GESA_destroy(gesa, GESA_DESTROY_STRINGS);
        return 1; // memory allocation failed
    } // if

    // || short-circuits: the arrays are read in order and reading stops at
    // the first short read
    if (fread(gesa->sa, sizeof(*gesa->sa), gesa->total, stream) != gesa->total ||
        fread(gesa->lcp, sizeof(*gesa->lcp), gesa->total, stream) != gesa->total ||
        fread(gesa->da, sizeof(*gesa->da), gesa->total, stream) != gesa->total)
    {
        GESA_destroy(gesa, GESA_DESTROY_STRINGS);
        return 1; // fread failed
    } // if

    return 0;
} // GESA_deserialize
// Scans the suffix array for the largest lcp value between two adjacent
// suffixes that belong to different strings and returns it. The first
// position that reaches the maximum wins. *document and *index receive the
// string number and offset of that suffix; they are not written when the
// result is 0.
GESA_index_t GESA_lcs(GESA_index_t* document,
                      GESA_index_t* index,
                      GESA const* const gesa)
{
    GESA_index_t best = 0;

    for (size_t rank = 1; rank < gesa->total; ++rank)
    {
        if (gesa->da[rank] == gesa->da[rank - 1])
        {
            continue; // both suffixes come from the same string
        } // if
        if (gesa->lcp[rank] <= best)
        {
            continue; // not longer than the best match so far
        } // if

        best = gesa->lcp[rank];
        *document = gesa->da[rank];
        *index = gesa->sa[rank];
    } // for

    return best;
} // GESA_lcs
#if !defined(NDEBUG)
// Writes the characters string[start, end) to stream, abbreviated to
// `length` characters when there are too many: the first length / 2 - 1
// characters, "..", and length / 2 - 1 characters from the tail.
// Ranges shorter than length + 2, or a length below 4, are written in
// full.
//
// The three pieces are written by separate statements: as operands of a
// single + expression their evaluation order would be unspecified and the
// output could appear out of order.
//
// Returns the number of characters written (the separator counts as 2).
static size_t print_truncated(GESA_char_t const* const string,
                              size_t const start,
                              size_t const end,
                              size_t const length,
                              FILE* stream)
{
    if (end - start < length + 2 || length < 4)
    {
        return fwrite(string + start, sizeof(*string), end - start, stream);
    } // if

    size_t const piece = length / 2 - 1;
    size_t written = fwrite(string + start, sizeof(*string), piece, stream);

    // fwrite instead of fputs: fputs returns a status, not a count
    written += fwrite("..", sizeof(char), 2, stream);

    // NOTE(review): the tail starts at end - length / 2 but only
    // length / 2 - 1 characters are written, so the character at end - 1
    // is left out -- presumably the string terminator; confirm
    written += fwrite(string + end - (length / 2), sizeof(*string), piece, stream);

    return written;
} // print_truncated
#endif
// Prints a GESA to stream: a table of the strings (number, length,
// abbreviated contents), the total, and then one line per suffix with its
// rank, document number, offset, lcp value and abbreviated text.
// Nothing is printed when NDEBUG is defined.
// fprintf return values are ignored as this is for debugging only
void GESA_print(GESA const* const gesa,
                FILE* stream)
{
    // keep the parameters "used" when the body is compiled out
    (void) gesa;
    (void) stream;
#if !defined(NDEBUG)
    // size_t values are printed with %zu; %ld only matches where size_t
    // happens to have the representation of long
    fprintf(stream,
            "GESA for %zu string(s):\n i length string\n",
            gesa->n);
    for (size_t i = 0; i < gesa->n; ++i)
    {
        fprintf(stream, "%5zu %5zu ", i, gesa->length[i]);
        print_truncated(gesa->string[i], 0, gesa->length[i], 40, stream);
        fprintf(stream, "\n");
    } // for
    fprintf(stream, "total: %5zu\n\n", gesa->total);

    fprintf(stream, " i da sa lcp suffix\n");
    for (size_t i = 0; i < gesa->total; ++i)
    {
        // NOTE(review): %5d assumes GESA_index_t promotes to int -- confirm
        fprintf(stream,
                "%5zu %5d %5d %5d ",
                i,
                gesa->da[i],
                gesa->sa[i],
                gesa->lcp[i]);
        // the suffix runs from its offset to the end of its string
        print_truncated(gesa->string[gesa->da[i]], gesa->sa[i], gesa->length[gesa->da[i]], 40, stream);
        fprintf(stream, "\n");
    } // for
    fprintf(stream, "\n");
#endif
} // GESA_print
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment