Hacker News new | ask | show | jobs
by gwu78 3310 days ago
This is my "Hacker News Reader". It converts HN to csv. (Only selected fields of interest to me.) From there it can easily be imported into kdb+. I have more reusable generalized lex techniques for other websites but HN is so simple it can be done via a braindead one-off as below.

Requirements: lex, cc

Usage:

   fetch -4o yc.htm https://news.ycombinator.com
   yc < yc.htm 
To compile this I use something like

    flex -Crfa -8 -i yc.l;
    cc -Wall -pipe lex.yy.c -static -o yc;
Save the text below as yc.l then compile as above.

    #define jmp BEGIN
    #define p printf
    #define x yytext
   %s aa bb cc dd ee ff gg hh
   %s ii jj kk ll mm nn oo 
   aa "span class=\"rank\""
   bb "a href=\""
   cc score_........
   dd \>
    /* #include <time.h> */
    /* #include <util.h> */
   %%
   [^\12\40-\176]
   , p("%2c");
    /* rank (dont care) */
   {aa} jmp aa;
    /* <aa>[1-9][^<\.]* p("\n%s,",x);jmp bb; */
   <aa>[1-9][^<\.]* p("\n");jmp bb; 
    /* url */
   <bb>{bb} jmp cc;
   <cc>http[^"]* p("%s,",x);jmp dd; 
    /* title */
   <dd>{dd} jmp ee;
    /* <ee>[^><]* p("%s,",x);jmp ff; */
   <ee>[^><]* p("%s",x);jmp ff;
    /* host (omit) */
    /* points (dont care) */
   <ff>{cc} jmp gg;
   <gg>{dd} jmp hh;
    /* <hh>[1-9][^<> p]* p("%s,",x);jmp ii; */
   <hh>[1-9][^<> p]* p(",");jmp ii; 
    /* user */
   <ii>{bb} jmp jj;
   <jj>http[^"]* p("%s,",x);jmp kk;
    /* time (dont care) */
   <kk>{bb} jmp ll;
    /* <ll>http[^"]* ; */
   <ll>http[^"]* jmp mm;
    /* unix time (dont care) */
    /* <ll>[1-9][^<]* { 
    time_t t0; time_t t1; time_t t2;
    t1=time(&t0);
    t2=parsedate(x,&t1,0);
    p("%d,",t2); 
    jmp mm;
    } */
    /* item */
   <mm>{bb} jmp nn;
    /* <nn>http[^"]* p("%s,",x);jmp oo; */
   <nn>http[^"]* p("%s",x);jmp oo;
    /* comments (dont care) */
   <oo>{dd} jmp oo;
    /* <oo>[1-9d][^ <]* p("%s",x);jmp 0; */
   <oo>[1-9d][^ <]* jmp 0;
   .
   \n
   %%
   main(){ yylex();}
   yywrap(){ p("\n");}
2 comments

And I thought using regex for parsing HTML was bad.