/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
107e509d50SXin LI
/* Implementation notes:
 *
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another randomly
 * with a fake sentence / paragraph structure.
 *
 * The goal is to generate a printable text
 * that can be used to fake a text compression scenario.
 * The resulting compression / ratio curve of the lorem ipsum generator
 * is more satisfying than the previous statistical generator,
 * which was initially designed for entropy compression,
 * and lacks a regularity more representative of text.
 *
 * The compression ratio achievable on the generated lorem ipsum
 * is still a bit too good, presumably because the dictionary is a bit too
 * small. It would be possible to create some more complex scheme, notably by
 * enlarging the dictionary with a word generator, and adding grammatical rules
 * (composition) and syntax rules. But that's probably overkill for the intended
 * goal.
 */
327e509d50SXin LI
#include "lorem.h"
#include <assert.h>
#include <limits.h> /* INT_MAX */
#include <string.h> /* memcpy */
377e509d50SXin LI
/* Upper bound on the length of a single word.
 * NOTE(review): not referenced by the code visible in this file. */
#define WORD_MAX_SIZE 20

/* Word pool.
 * Words are picked from this table by index. The first 19 entries, taken in
 * order, make up the fixed opening sentence, so their order matters. */
static const char* kWords[] = {
    "lorem",        "ipsum",       "dolor",       "sit",          "amet",
    "consectetur",  "adipiscing",  "elit",        "sed",          "do",
    "eiusmod",      "tempor",      "incididunt",  "ut",           "labore",
    "et",           "dolore",      "magna",       "aliqua",       "dis",
    "lectus",       "vestibulum",  "mattis",      "ullamcorper",  "velit",
    "commodo",      "a",           "lacus",       "arcu",         "magnis",
    "parturient",   "montes",      "nascetur",    "ridiculus",    "mus",
    "mauris",       "nulla",       "malesuada",   "pellentesque", "eget",
    "gravida",      "in",          "dictum",      "non",          "erat",
    "nam",          "voluptat",    "maecenas",    "blandit",      "aliquam",
    "etiam",        "enim",        "lobortis",    "scelerisque",  "fermentum",
    "dui",          "faucibus",    "ornare",      "at",           "elementum",
    "eu",           "facilisis",   "odio",        "morbi",        "quis",
    "eros",         "donec",       "ac",          "orci",         "purus",
    "turpis",       "cursus",      "leo",         "vel",          "porta",
    "consequat",    "interdum",    "varius",      "vulputate",    "aliquet",
    "pharetra",     "nunc",        "auctor",      "urna",         "id",
    "metus",        "viverra",     "nibh",        "cras",         "mi",
    "unde",         "omnis",       "iste",        "natus",        "error",
    "perspiciatis", "voluptatem",  "accusantium", "doloremque",   "laudantium",
    "totam",        "rem",         "aperiam",     "eaque",        "ipsa",
    "quae",         "ab",          "illo",        "inventore",    "veritatis",
    "quasi",        "architecto",  "beatae",      "vitae",        "dicta",
    "sunt",         "explicabo",   "nemo",        "ipsam",        "quia",
    "voluptas",     "aspernatur",  "aut",         "odit",         "fugit",
    "consequuntur", "magni",       "dolores",     "eos",          "qui",
    "ratione",      "sequi",       "nesciunt",    "neque",        "porro",
    "quisquam",     "est",         "dolorem",     "adipisci",     "numquam",
    "eius",         "modi",        "tempora",     "incidunt",     "magnam",
    "quaerat",      "ad",          "minima",      "veniam",       "nostrum",
    "ullam",        "corporis",    "suscipit",    "laboriosam",   "nisi",
    "aliquid",      "ex",          "ea",          "commodi",      "consequatur",
    "autem",        "eum",         "iure",        "voluptate",    "esse",
    "quam",         "nihil",       "molestiae",   "illum",        "fugiat",
    "quo",          "pariatur",    "vero",        "accusamus",    "iusto",
    "dignissimos",  "ducimus",     "blanditiis",  "praesentium",  "voluptatum",
    "deleniti",     "atque",       "corrupti",    "quos",         "quas",
    "molestias",    "excepturi",   "sint",        "occaecati",    "cupiditate",
    "provident",    "similique",   "culpa",       "officia",      "deserunt",
    "mollitia",     "animi",       "laborum",     "dolorum",      "fuga",
    "harum",        "quidem",      "rerum",       "facilis",      "expedita",
    "distinctio",   "libero",      "tempore",     "cum",          "soluta",
    "nobis",        "eligendi",    "optio",       "cumque",       "impedit",
    "minus",        "quod",        "maxime",      "placeat",      "facere",
    "possimus",     "assumenda",   "repellendus", "temporibus",   "quibusdam",
    "officiis",     "debitis",     "saepe",       "eveniet",      "voluptates",
    "repudiandae",  "recusandae",  "itaque",      "earum",        "hic",
    "tenetur",      "sapiente",    "delectus",    "reiciendis",   "cillum",
    "maiores",      "alias",       "perferendis", "doloribus",    "asperiores",
    "repellat",     "minim",       "nostrud",     "exercitation", "ullamco",
    "laboris",      "aliquip",     "duis",        "aute",         "irure",
};
/* Number of entries in kWords. */
static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]);
957e509d50SXin LI
/* Simple 1-dimension distribution, based on word length; favors small words.
 * kWeights[len] is the number of slots a word of length `len` receives in
 * g_distrib. Words at least as long as the table use its last entry. */
static const int kWeights[] = { 0, 8, 6, 4, 3, 2 };
static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);

/* Capacity of the weighted lookup table. */
#define DISTRIB_SIZE_MAX 650
/* Weighted lookup table: each entry is an index into the word pool.
 * A word appears as many times as its weight. */
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
/* Number of valid entries in g_distrib; 0 means the table is not built yet. */
static unsigned g_distribCount         = 0;
1047e509d50SXin LI
countFreqs(const char * words[],size_t nbWords,const int * weights,size_t nbWeights)1057e509d50SXin LI static void countFreqs(
1067e509d50SXin LI const char* words[],
1077e509d50SXin LI size_t nbWords,
1087e509d50SXin LI const int* weights,
1097e509d50SXin LI size_t nbWeights)
1107e509d50SXin LI {
1117e509d50SXin LI unsigned total = 0;
1127e509d50SXin LI size_t w;
1137e509d50SXin LI for (w = 0; w < nbWords; w++) {
1147e509d50SXin LI size_t len = strlen(words[w]);
1157e509d50SXin LI int lmax;
1167e509d50SXin LI if (len >= nbWeights)
1177e509d50SXin LI len = nbWeights - 1;
1187e509d50SXin LI lmax = weights[len];
1197e509d50SXin LI total += (unsigned)lmax;
1207e509d50SXin LI }
1217e509d50SXin LI g_distribCount = total;
1227e509d50SXin LI assert(g_distribCount <= DISTRIB_SIZE_MAX);
1237e509d50SXin LI }
1247e509d50SXin LI
init_word_distrib(const char * words[],size_t nbWords,const int * weights,size_t nbWeights)1257e509d50SXin LI static void init_word_distrib(
1267e509d50SXin LI const char* words[],
1277e509d50SXin LI size_t nbWords,
1287e509d50SXin LI const int* weights,
1297e509d50SXin LI size_t nbWeights)
1307e509d50SXin LI {
1317e509d50SXin LI size_t w, d = 0;
1327e509d50SXin LI countFreqs(words, nbWords, weights, nbWeights);
1337e509d50SXin LI for (w = 0; w < nbWords; w++) {
1347e509d50SXin LI size_t len = strlen(words[w]);
1357e509d50SXin LI int l, lmax;
1367e509d50SXin LI if (len >= nbWeights)
1377e509d50SXin LI len = nbWeights - 1;
1387e509d50SXin LI lmax = weights[len];
1397e509d50SXin LI for (l = 0; l < lmax; l++) {
1407e509d50SXin LI g_distrib[d++] = (int)w;
1417e509d50SXin LI }
1427e509d50SXin LI }
1437e509d50SXin LI }
1447e509d50SXin LI
/* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed */
static char* g_ptr        = NULL;     /* destination buffer of the current generation */
static size_t g_nbChars   = 0;        /* number of bytes already written into g_ptr */
static size_t g_maxChars  = 10000000; /* capacity of g_ptr, in bytes */
/* State of the pseudo-random generator. */
static unsigned g_randRoot = 0;

/* Rotate the 32-bit value `x` left by `r` bits, with `r` in [1, 31].
 * Arguments are parenthesized so that expressions can be passed safely. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* LOREM_rand():
 * Advances the generator state g_randRoot by one step
 * (multiply, xor, rotate) and maps the new state onto `range`.
 *
 * @range  : exclusive upper bound of the result
 * @return : a value in [0, range); always 0 when range is 0 or 1
 *
 * Note: the arithmetic assumes that `unsigned` is 32 bits wide.
 */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32              = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32     = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    /* scale the 32-bit state into [0, range) with a 64-bit multiply-shift */
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
1647e509d50SXin LI
writeLastCharacters(void)1657e509d50SXin LI static void writeLastCharacters(void)
1667e509d50SXin LI {
1677e509d50SXin LI size_t lastChars = g_maxChars - g_nbChars;
1687e509d50SXin LI assert(g_maxChars >= g_nbChars);
1697e509d50SXin LI if (lastChars == 0)
1707e509d50SXin LI return;
1717e509d50SXin LI g_ptr[g_nbChars++] = '.';
1727e509d50SXin LI if (lastChars > 2) {
1737e509d50SXin LI memset(g_ptr + g_nbChars, ' ', lastChars - 2);
1747e509d50SXin LI }
1757e509d50SXin LI if (lastChars > 1) {
1767e509d50SXin LI g_ptr[g_maxChars - 1] = '\n';
1777e509d50SXin LI }
1787e509d50SXin LI g_nbChars = g_maxChars;
1797e509d50SXin LI }
1807e509d50SXin LI
generateWord(const char * word,const char * separator,int upCase)1817e509d50SXin LI static void generateWord(const char* word, const char* separator, int upCase)
1827e509d50SXin LI {
1837e509d50SXin LI size_t const len = strlen(word) + strlen(separator);
1847e509d50SXin LI if (g_nbChars + len > g_maxChars) {
1857e509d50SXin LI writeLastCharacters();
1867e509d50SXin LI return;
1877e509d50SXin LI }
1887e509d50SXin LI memcpy(g_ptr + g_nbChars, word, strlen(word));
1897e509d50SXin LI if (upCase) {
1907e509d50SXin LI static const char toUp = 'A' - 'a';
1917e509d50SXin LI g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
1927e509d50SXin LI }
1937e509d50SXin LI g_nbChars += strlen(word);
1947e509d50SXin LI memcpy(g_ptr + g_nbChars, separator, strlen(separator));
1957e509d50SXin LI g_nbChars += strlen(separator);
1967e509d50SXin LI }
1977e509d50SXin LI
/* about():
 * Returns a random value "around" `target`: the sum of two draws in
 * [0, target) plus 1. For target >= 1 the result lies in
 * [1, 2*target - 1]. Consumes two values from the random generator.
 */
static int about(unsigned target)
{
    /* two separate statements make the order of the draws explicit */
    unsigned const first  = LOREM_rand(target);
    unsigned const second = LOREM_rand(target);
    return (int)(first + second + 1);
}
2027e509d50SXin LI
2037e509d50SXin LI /* Function to generate a random sentence */
generateSentence(int nbWords)2047e509d50SXin LI static void generateSentence(int nbWords)
2057e509d50SXin LI {
2067e509d50SXin LI int commaPos = about(9);
2077e509d50SXin LI int comma2 = commaPos + about(7);
2087e509d50SXin LI int qmark = (LOREM_rand(11) == 7);
2097e509d50SXin LI const char* endSep = qmark ? "? " : ". ";
2107e509d50SXin LI int i;
2117e509d50SXin LI for (i = 0; i < nbWords; i++) {
2127e509d50SXin LI int const wordID = g_distrib[LOREM_rand(g_distribCount)];
2137e509d50SXin LI const char* const word = kWords[wordID];
2147e509d50SXin LI const char* sep = " ";
2157e509d50SXin LI if (i == commaPos)
2167e509d50SXin LI sep = ", ";
2177e509d50SXin LI if (i == comma2)
2187e509d50SXin LI sep = ", ";
2197e509d50SXin LI if (i == nbWords - 1)
2207e509d50SXin LI sep = endSep;
2217e509d50SXin LI generateWord(word, sep, i == 0);
2227e509d50SXin LI }
2237e509d50SXin LI }
2247e509d50SXin LI
generateParagraph(int nbSentences)2257e509d50SXin LI static void generateParagraph(int nbSentences)
2267e509d50SXin LI {
2277e509d50SXin LI int i;
2287e509d50SXin LI for (i = 0; i < nbSentences; i++) {
2297e509d50SXin LI int wordsPerSentence = about(11);
2307e509d50SXin LI generateSentence(wordsPerSentence);
2317e509d50SXin LI }
2327e509d50SXin LI if (g_nbChars < g_maxChars) {
2337e509d50SXin LI g_ptr[g_nbChars++] = '\n';
2347e509d50SXin LI }
2357e509d50SXin LI if (g_nbChars < g_maxChars) {
2367e509d50SXin LI g_ptr[g_nbChars++] = '\n';
2377e509d50SXin LI }
2387e509d50SXin LI }
2397e509d50SXin LI
2407e509d50SXin LI /* It's "common" for lorem ipsum generators to start with the same first
2417e509d50SXin LI * pre-defined sentence */
generateFirstSentence(void)2427e509d50SXin LI static void generateFirstSentence(void)
2437e509d50SXin LI {
2447e509d50SXin LI int i;
2457e509d50SXin LI for (i = 0; i < 18; i++) {
2467e509d50SXin LI const char* word = kWords[i];
2477e509d50SXin LI const char* separator = " ";
2487e509d50SXin LI if (i == 4)
2497e509d50SXin LI separator = ", ";
2507e509d50SXin LI if (i == 7)
2517e509d50SXin LI separator = ", ";
2527e509d50SXin LI generateWord(word, separator, i == 0);
2537e509d50SXin LI }
2547e509d50SXin LI generateWord(kWords[18], ". ", 0);
2557e509d50SXin LI }
2567e509d50SXin LI
2577e509d50SXin LI size_t
LOREM_genBlock(void * buffer,size_t size,unsigned seed,int first,int fill)2587e509d50SXin LI LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
2597e509d50SXin LI {
2607e509d50SXin LI g_ptr = (char*)buffer;
2617e509d50SXin LI assert(size < INT_MAX);
2627e509d50SXin LI g_maxChars = size;
2637e509d50SXin LI g_nbChars = 0;
2647e509d50SXin LI g_randRoot = seed;
2657e509d50SXin LI if (g_distribCount == 0) {
2667e509d50SXin LI init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
2677e509d50SXin LI }
2687e509d50SXin LI
2697e509d50SXin LI if (first) {
2707e509d50SXin LI generateFirstSentence();
2717e509d50SXin LI }
2727e509d50SXin LI while (g_nbChars < g_maxChars) {
2737e509d50SXin LI int sentencePerParagraph = about(7);
2747e509d50SXin LI generateParagraph(sentencePerParagraph);
2757e509d50SXin LI if (!fill)
2767e509d50SXin LI break; /* only generate one paragraph in not-fill mode */
2777e509d50SXin LI }
2787e509d50SXin LI g_ptr = NULL;
2797e509d50SXin LI return g_nbChars;
2807e509d50SXin LI }
2817e509d50SXin LI
/* LOREM_genBuffer():
 * Fills `buffer` with exactly `size` bytes of lorem ipsum text derived from
 * `seed`, starting with the fixed opening sentence.
 * Shortcut for LOREM_genBlock() with `first` and `fill` both enabled.
 */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    LOREM_genBlock(buffer, size, seed, 1, 1);
}