Robert Klemme wrote:
> I once made a script that takes a list of strings and creates a
> single regexp from them that is supposedly matched fast (i.e. doesn't need
> backtracking). If you're interested I can check at work tomorrow - I'm
> quite sure I still have it somewhere (please email as reminder).
Is this a trie optimization? I've written something similar for
Regexp::English but the optimization itself is quite slow and takes up
lots of resources. (Because it needs to build the trie.)
It can do this:
irb(main):006:0> re
=> /foobar|foobark|foobard|foobarqux|food|fool|foolish/
irb(main):007:0> re.optimize
=> /foo(?:l(?:ish)?|bar(?:qux|[kd])?|d)/
It was able to create an optimized Regexp that matches the first few
hundred English words from a words file -- I've attached it.
> Kind regards
> robert
More regards,
Florian Gross
/a(?:ki(?:mbo|n)|l(?:veol(?:ar|i|us)|k(?:al(?

ids? |i(?:ne|s)?)|yl)|ways|l(?:ay(?:ed|s|ing)?|y(?:ing) ?|o(?:w(?:a(?:bl(?:[ye])|nces?)|ed|s|ing)?|ys?|cat(?:able|ors?|e(?:[ds])?|i(?:ng|ons?))|phon(?:es?|ic)|t(?:ments?|s|t(?:e (?:[dr])|ing))?)|e(?:viat(?:e(?:[ds])?|i(?:ng|on))|les?|mande|y(?:ways?|s)?|rg(?:y|i(? :c|es))|g(?:ations?|or(?:y|i(?:c(?:al(?:ly)?)?|es) )|e(?:d(?:ly)?|s)?|rettos?|i(?:ances?|ng)))|i(?:an ces?|e(?:[ds])|gators?|terati(?:ve|ons?))|u(?:d(?:e(?:[ds])?|ing)|r(?:e(?:ment)?|ing)|si(?:ve(?:ness)?|ons?) ))?|a(?:baster|crity|rm(?:ed|s|i(?:ng(?:ly)?|st))? |s)|m(?:a(?:nacs?)?|o(?:n(?:ds?|er)|st)|s(?:man)?| ighty)|b(?:a(?:core|tross)?|eit|um(?:s|in)?)|nico| c(?

(?:ves?|hol(?:s|i(?:cs?|sm))?)|hemy)|o(?:n(?: e(?:ness)?|g(?:side)?)|of(?:ness)?|es?|ft|ha|ud)|d er(?:m(?:an|en))?|p(?:ha(?:bet(?:s|i(?:c(?:al(?:ly )?|s)?|z(?:e(?:[ds])?|ing)))?|numeric)?|ine)|e(?:e|rt(?:ly|ness|e(?:d (?:ly)?|rs?)|s|ing)?)?|f(?:alfa|resco)|ready|g(?:a (?:e(?:cide)?)?|orithm(?:s|ic(?:ally)?)?|ebra(?:s| ic(?:ally)?)?|inate)|so|t(?:ars?|ogether|er(?:a(?: ble|tions?)|nat(?

rs?|e(?:[ds]|ly)?|i(?:ve(?:ly|s)?|ng|ons?))|cations?|e(?:d|rs? )|s|ing)?|ruis(?:m|t(?:ic(?:ally)?)?)|hough|itudes ?)|i(?:ve|ke|as(?:e(?:[ds])|ing)?|m(?

ny|ents?)|bis?|en(?:at(?:e(?:[ds])?|i(?:ng|on))|s)?|g(?:n(?:ments?|ed|s|ing)?|ht))| u(?:m(?:n(?:ae?|i|us)|inum)?|ndum))|m(?:a(?:lgam(? :at(?:e(?:[ds])?|i(?:ng|on))|s)?|nuensis|z(?:e(?:ment|d(?:ly)?|r s?|s)?|ing(?:ly)?)|retto|ss(?:e(?:[ds])|ing)?|t(?

ry|eur(?:s|is(?:m|h(?:ness)?))?)|in)| m(?

(?:ni(?:ac?|um))?|unition)|b(?:l(?:e(?:[drs])?|ing)|assadors?|er|rosial|i(?:valen(?:ce|t(?:ly) ?)|ance|dextrous(?:ly)?|ent|gu(?

us(?:ly)?|it(?:y |ies))|tio(?:ns?|us(?:ly)?))|u(?:la(?:nces?|tory)| s(?:cade|h(?:e(?:[ds]))?)))|yl|nesty|o(?:k|ng(?:st)?|eba(?:[es])?|r(?:al(?:ity)?|ous|phous(?:ly)?|tiz(?:e(?:[ds])?|ing)|ist)|u(?:nt(?:e(?:d|rs?)|s|ing)?|r))|p(?:l (?:y|e|i(?:f(?:y(?:ing)?|i(?:cation|e(?:d|rs?|s))) |tudes?))|oules?|er(?:age|es?|sands?)|h(?:etamines ?|i(?:b(?

logy|i(?:ans?|ous(?:ly)?))|theaters?))| utat(?:e(?:[ds])?|ing))|e(?:liorat(?:ed?|i(?:ng|on))|n(?:able|orr hea|d(?:ments?|ed|s|ing)?|it(?:y|ies))?|ricium)|i( ?:able|no|cabl(?:[ye])|d(?:e|st)?|go|ss|ty)|u(?:lets?|s(?:e(?:ments?|d( ?:ly)?|rs?|s)?|ing(?:ly)?)))?|b(?:l(?:a(?:ze|t(?:e (?:[ds])?|i(?:ve|ng|on)))|y|e(?:r|st)?)|a(?:ndon(?:ment|e d|s|ing)?|ck|ft|s(?:e(?:ments?|d|s)?|h(?:e(?:[ds])|ing)?|ing)|t(?:e(?:ments?|d|r|s)?|ing))|b(?

ts? |e(?:ys?)?|reviat(?:e(?:[ds])?|i(?:ng|ons?)))|ys(?:mal(?:ly)?|s(?:es)?)|normal (?:ly|it(?:y|ies))?|o(?:ve(?:mentioned|board|groun d)?|li(?:sh(?:ments?|e(?:d|rs?|s)|ing)?|tion(?:ist s?)?)|ard|mina(?:ble|te)|des?|r(?:t(?:ed|s|i(?:ve( ?:ly)?|ng|ons?))?|igin(?:al|es?))|u(?:nd(?:ed|s|in g)?|t))|d(?

m(?:ens?|inal)|uct(?

rs?|ed|s|ions?) ?)|e(?:yance|d|rra(?:nt|tions?)|t(?:s|t(?:e(?:[dr])|ing))?)|r(?:a(?:d(?:e(?:[ds])?|ing)|si(?:ve|ons?))|o(?:ad|gat(?:e(?:[ds])?|ing))|ea(?:ctions?|st)|idg(?:ment|e(?:[ds])?|ing)|upt(?:ly|ness)?)|s(?:c(?

nd(?:ed|s|ing)?| ess(?:e(?:[ds]))?|issas?)|o(?:l(?:v(?:e(?:[ds])?|ing)|ut(?:e(?:ly|ness|s)?|ion))|r(?:b(?:e(?:n(? :cy|t)|d|r)|s|ing)?|pti(?:ve|ons?)))|en(?:ces?|t(? :ly|minded|e(?:d|e(?:s|ism)?)|s|i(?:a|ng))?)|t(?:a in(?:e(?:[dr])|s|ing)?|entions?|r(?:act(?:ly|ness|ors?|ed|s|i(? :ng|on(?:s|is(?:[mt]))?))?|use(?:ness)?)|inence)|inthe|urd(?:ly|it(?:y |ies))?)|hor(?:r(?:e(?:[dr]|nt)|ing)|s)?|i(?:lit(?:y|ies)|d(?:e(?:[ds])?|ing))|u(?:ndan(?:ce|t(?:ly)?)|s(?:e(?:[ds])?|i(?:ve|ng))|t(?:ment|s|t(?:e(?:d|rs?)|ing))?)|j (?:ect(?:ly|ness|ions?)?|ur(?:e(?:[ds])?|ing)))|n(?:kles?|a(?:l(?:y(?:z(?:able|e(?:d|rs? |s)?|ing)|s(?:es|ts?|is)|tic(?:al(?:ly)?|it(?:y|ie s))?)|og(?:y|ous(?:ly)?|i(?:cal|es)|ues?)?)?|c(?

ndas?|hronis(?:ms?|tically))|p(?:lasmosis|hor(?:a| ic(?:ally)?))|erobic|rch(?:y|i(?:c(?:al)?|s(?:m|ts ?)))|grams?|stomo(?:s(?:es|is)|tic)|t(?

m(?:y|ic( ?:al(?:ly)?)?)|hema))|n(?:als?|o(?:y(?:ances?|e(?: d|rs?)|s|ing(?:ly)?)?|tat(?:e(?:[ds])?|i(?:ng|ons?))|unc(?:e(?:ments?|d|rs?|s)?|ing))| ex(?:ation|e(?:[ds])|ing)?|i(?:versar(?:y|ies)|hilat(?:e(?:[ds])?|i(?:ng|on)))|ual(?:ly|s)?)|c(?:est(?

rs?|r(?:a l|y))|ho(?:v(?:y|ies)|r(?:ages?|ed|s|i(?:ng|t(?:e| ism)))?)|i(?:llary|ent(?:ly|s)?))|d(?:ers|ing)?|e( ?:w|m(?

(?:met(?:ers?|ry)|ne)|i(?:[ac]))|c(?:dot(?:al|es?)|hoic)|sthe(?:sia|ti(?:c(?:all y|s)?|z(?:e(?:[ds])?|ing))))|g(?:l(?:e(?:d|rs?)?|ing)|e(?:l(?:s|ic)? |r(?:ed|s|ing)?)|r(?:y|i(?:ly|e(?:r|st)))|st(?:rom )?|iography|u(?:lar(?:ly)?|ish(?:ed)?))|hydrous(?: ly)?|i(?:line|m(?:a(?:ls?|t(?

rs?|e(?:ly|ness|d(? :ly)?|s)?|i(?:ng|ons?)))|osity|i(?:zed|sm))|on(?:s |ic)?|s(?

trop(?:y|ic)|e(?:ikonic)?)))?|c(?:knowl edg(?:ments?|e(?:able|ments?|d|rs?|s)?|ing)|a(?:ci a|dem(?:y|i(?:a|c(?:ally|s)?|es)))|me|yclic(?:ally )?|ne|c(?:l(?:a(?:mation|im(?:ed|s|ing)?)|imat(?:e (?:[ds])?|i(?:ng|z(?:ation|ed))))|o(?:lades?|m(?:modat(?: e(?:[ds])?|i(?:ng|ons?))|p(?:li(?:ces?|sh(?:ments?|e(?:d|r s?|s)|ing)?)|an(?:y(?:ing)?|i(?:ments?|e(?:[ds])|sts?))))|rd(?:ance|e(?:d|rs?)|s|i(?:ng(?:ly)?|on s?))?|st(?:ed|s|ing)?|unt(?:a(?:b(?:l(?:[ye])|ility)|n(?:cy|ts?))|ed|s|ing)?)|e(?:ler(?:at(?

rs?|e(?:[ds])?|i(?:ng|ons?))|ometers?)|nt(?:ed|s|ing|ua(?:l|t( ?:e(?:[ds])?|i(?:ng|on))))?|de(?:[ds])?|pt(?:a(?:b(?:l(?:[ye])|ility)|nces?)|ors?|e(?:d|rs?)|s|ing)?|ss(?

r(?:[ys]|ies)|e(?:[ds])|i(?:b(?:l(?:[ye])|ility)|ng|ons?))?)|r(?:e(?:dit(?:ations?|ed)?|ti ons?)|u(?:e(?:[ds])?|ing))|ident(?:ly|al(?:ly)?|s)?|u(?:lturat(?:e(? :[ds])?|i(?:ng|on))|mulat(?

rs?|e(?:[ds])?|i(?:ng|ons?))|r(?:a(?:c(?:y|ies)|te(?:ly|ness)? )|sed)|s(?:a(?:l|ti(?:ve|ons?))|e(?:[drs])?|tom(?:ed|s|ing)?|ing(?:ly)?)))|o(?:lytes?|rns?| ustic(?:al(?:ly)?|s|ian)?)|e(?:s|t(?:ate|ylene|one ))?|qu(?:aint(?:ances?|ed|s|ing)?|i(?:esc(?:e(?:n( ?:ce|t)|d|s)?|ing)|r(?:able|e(?:[ds])?|ing)|siti(?:ve(?:ness)?|ons?)|t(?:s|t(?:al|e(?:[dr])|ing))?))|r(?:ylic|o(?:bat(?:s|ics?)?|nyms?|polis |ss)|e(?:age|s)?|i(?:mon(?:y|ious)|d))|h(?:e(?:[ds])?|i(?:ng|ev(?:able|e(?:ments?|d|rs?|s)?|ing)))|t( ?

rs?|ed|ress(?:es)?|i(?:v(?:at(?

rs?|e(?:[ds])?|i(?:ng|ons?))|e(?:ly)?|i(?:s(?:m|ts?)|t(?:y|ies )))|n(?

meters?|g|ium)|ons?)|ua(?:l(?:ly|s|i(?:za tion|t(?:y|ies)))?|rial(?:ly)?|t(?

rs?|e(?:[ds])?|ing)))?|id(?:ly|s|i(?:c|t(?:y|ies))|ulous)?|u(? :men|te(?:ly|ness)?|ity))|d(?:v(?:an(?:c(?:e(?:men ts?|d|s)?|ing)|tage(?

us(?:ly)?|d|s)?)|oca(?:cy|t (?:e(?:[ds])?|ing))|e(?:nt(?:i(?:sts?|tious)|ur(?

us|e(?:d|r s?|s)?|ing))?|r(?:b(?:s|ial)?|s(?:ar(?:y|ies)|e(?: ly)?|it(?:y|ies))|t(?:is(?:e(?:ments?|d|rs?|s)?|in g))?))|i(?:ce|s(?:ab(?:l(?:[ye])|ility)|or(?:[ys])?|e(?:ments?|d(?:ly)?|es?|rs?|s)?|ing)))|a(?:mant (?:ly)?|pt(?:a(?:b(?:le|ility)|tions?)|ors?|e(?:d| rs?)|s|i(?:ve(?:ly)?|ng))?|g(?:es?|ios?))|m(?

ni( ?:sh(?:ments?|e(?:[ds])|ing)?|tions?)|i(?

(?:e(?:[ds])|ture)?|nist(?:er(?:ed|s|ings?)?|ra(?:ble|t(?

rs ?|e|i(?:ve(?:ly)?|ons?))))|r(?:a(?:l(?:s|ty)?|bl(? :[ye])|tions?)|e(?:d|rs?|s)?|ing(?:ly)?)|ssi(?:b(?:le|i lity)|ons?)|t(?:s|t(?:ance|e(?:d(?:ly)?|rs?)|ing)) ?))|o(?:lescen(?:ce|ts?)|be|pt(?:e(?:d|rs?)|s|i(?: ve|ng|ons?))?|r(?:a(?:ble|tion)|n(?:ments?|ed|s)?| e(?:[ds])?))?|d(?:e(?:nd(?:a|um)?|d|rs?)|ress(?:ab(?:le|il ity)|e(?:d|es?|rs?|s)|ing)?|s|i(?:ng|ct(?:ed|s|i(? :ng|ons?))?|ti(?:v(?:es?|ity)|on(?:al(?:ly)?|s)?)) |uc(?:e(?:[ds])?|t(?

r|ed|s|i(?:ng|on))?|i(?:ble|ng)))?|e(?

t| qua(?:c(?:y|ies)|te(?:ly)?))|r(?

it(?:ness)?|enal (?:ine)?|ift)|s(?

r(?:b(?:ed|s|ing)?|ption))?|he( ?:r(?:e(?:n(?:ce|ts?)|d|rs?|s)?|ing)|si(?:ves?|ons ?))|i(?:abatic(?:ally)?|eu)|u(?:l(?:at(?:e|i(?:ng| on))|t(?:er(?:at(?:e(?:[ds])?|ing)|y|ous(?:ly)?|ers?)|s|hood)?)|mbrat(?:e(?:[ds])?|i(?:ng|on)))|j(?:acen(?:cy|t)|o(?:in(?:ed|s|ing )?|urn(?:ment|ed|s|ing)?)|ectives?|u(?:ncts?|d(?:g (?:e(?:[ds])?|ing)|icat(?:e(?:[ds])?|i(?:ng|ons?)))|r(?:e(?:[ds])?|ing)|st(?:abl(?:[ye])|ments?|ors?|e(?:d|rs?)|s|ing)?|tants?)))?|e(?:r( ?:at(?

rs?|e(?:[ds])?|i(?:ng|on))|o(?:acoustic|bics?|nautic(?:al|s)?| dynamics?|s(?

l(?:s|ize)?|pace))|ials?)|gis|sthet ic(?:ally|s)?)|f(?:l(?:ame|oat)|ar|o(?

t|re(?:men tioned|said|thought)?|ul)|f(?:l(?:ict(?:ed|s|i(?:v e|ng|ons?))?|uen(?:ce|t))|a(?:ble|irs?)|ord(?:able |ed|s|ing)?|e(?:ct(?:ations?|ed|s|i(?:ve|ng(?:ly)? |on(?:ate(?:ly)?|s)?))?|rent)|r(?

nt(?:ed|s|ing)? |i(?:cates?|ght))|i(?:liat(?:e(?:[ds])?|i(?:ng|ons?))|anced|x(?:e(?:[ds])|ing)?|nit(?:y|ies)|davits?|rm(?:ati(?:ve(?:ly)?| ons?)|ed|s|ing)?))|r(?:aid|esh)|t(?:er(?:wards?|li fe|m(?:ath|ost)|noons?|effect|glow|shocks?|thought s?|image)?)?|i(?:cionado|eld|re))|g(?:l(?

w|eam)| a(?

e|r|tes?|in(?:st)?)|nostics?|o(?:n(?:y|i(?:z( ?:e(?:[ds])?|ing(?:ly)?)|es))|g)?|e(?:less|n(?:c(?:y|ies)|da s?|ts?)|d|rs?|s)?|r(?:arian|ee(?:abl(?:[ye])|ments?|d|rs?|s|ing)?|icultur(?:al(?:ly)?|e))|g(? :l(?

merat(?:e(?:[ds])?|ion)|utin(?:at(?:e(?:[ds])?|i(?:ng|on))|ins?))|r(?:a(?:vat(?:e(?:[ds])?|ion)|ndize)|e(?:gat(?:e(?:[ds]|ly)?|i(?:ng|ons?))|ss(?

rs?|i(?:ve(?:ly|ness)?|o ns?)))|iev(?:e(?:[ds])?|ing)))|hast|i(?:l(?:e(?:ly)?|ity)|ng|tat(?

rs? |e(?:[ds])?|i(?:ng|ons?)))|ue)|h(?:ead)?|i(?:l(?:ments?|ero ns?|ing)?|m(?:less(?:ly)?|e(?:d|rs?)|s|ing)?|d(?:e d?|s|ing)?|r(?:ways?|l(?

cks?|ess|i(?:ne(?:[rs])?|fts?))|m(?:a(?:n|ils?)|en)|b(?:ags?|orne)|y|cra ft|drops?|p(?:lanes?|orts?)|e(?:d|rs?)|f(?:low|are |oils?|rames?|ields?)|s(?

(?:ace|eed)|hips?|trips ?)?|tight|i(?:ly|ngs?))?|sle)|jar)|A(?:k(?:ers|ron )|LGOL|l(?:v(?:a(?:rez)?|in)|l(?:a(?:[nh])|yn|e(?:n(?:dale|town)?|g(?:ra|hen(?:y|ies)))|sta te|is(?

n)?)|a(?:m(?

s?|eda)|bam(?:a(?:ns)?|ian) |n|ddin|r|s(?:kan?|tair))|maden|b(?:an(?:y|ia(?:ns ?)?)|er(?:t(?:[ao])?|ich)|r(?:echt|ight)|uquerque)|yssa|c(?:mena|o(? :a|tt)|estis|ibiades)|d(?:e(?:baran|n)|rich)|p(?:e rt|s|h(?

nse|eratz))|e(?

(?:and(?:er|r(?:a|e|i(? :a|ne)))|ei|is)?|ck?|ut(?:ian)?)|f(?:a|onso|redo?) |g(?

(?:l|nqui(?:an|n))|e(?:nib|r(?:ian?)?)|iers) |s(?:atians?|op)|hambra|t(?:air|o(?:[ns])|haea)|i(?:c(?:e|ia)|s(?

n|tair))?)?|ar(?

n|hus )|m(?:a(?:nda|zons?|deus|rillo)|m(?:an|erman)|y|o( ?:ntillado|co|s)|dahl|pex|e(?:lia|r(?:ada|ica(?:n( ?:a|s|i(?:z(?:ations?|e(?:rs?|s)?)|sm))?|s)?)|s)|s terdam|h(?:aric|erst)|trak|iga)|b(?:aba|b(?:[ay]|ott)|yssinia(?:ns?)?|ner|os?|e(?:l(?:son|ian)?|r( ?:nathy|deen))?|ra(?:m(?:s(?

n)?)?|ham)|i(?:lene| djan|gail)|u)|n(?:kara|a(?:lects|b(?:aptists?|el)| creon|stasia|heim|tol(?:e|ian?))|n(?:a(?:list(?:ic )?|polis)?|e(?:tte)?|ie)?|d(?:alusia(?:ns?)?|y|o(? :ver|rra)|e(?:an|rs(?

n|en)|s)|r(?

m(?:ache|eda) |e(?:ws?|a|i)?))|g(?:l(?

(?

h(?

bia|ilia))?|es| i(?:a|can(?:s|i(?:zes?|sm))?))|o(?:la|ra)|el(?:a|o |e(?:nos?|s)|i(?:n(?:[ae])|ca))|ie|us)|heuser|ita)|c(?:k(?:ley|erman)|a(?:d ia|pulco)|cra|h(?:aeans?|illes)|t(?:a(?:eon)?|on|s ))|d(?:kins|ler(?:ian)?|a(?:m(?:s(?

n)?)?|ir)?|o( ?:lph(?:us)?|nis)|d(?:ressograph|is(?

n)?)|e(?:l( ?:aide|e|ia)|n)|ri(?:a(?:n|tic)|enne)|irondacks?)| e(?:ne(?:as|id)|olus|robacter|gean|s(?:chylus|op)) |f(?:ri(?:ka(?:ans|ners?)|ca(?:n(?:s|iz(?:ations?| e(?:[ds])?|ing))?)?)|ghan(?:s|istan)?)|g(?:way|a(?:memnon| tha)|ne(?:[ws])|ee|ricola|gies?)|hm(?:adabad|edabad)|i(?:ken|lee n|nus?|d(?:a|es)|r(?:bus|e(?:dale|s))|tken)|jax)/