| HN Mirror

Y	Hacker News new \| ask \| show \| jobs

by gwu78 3358 days ago

This is my "Hacker News Reader". It converts HN to csv. (Only selected fields of interest to me.) From there it can easily be imported into kdb+. I have more reusable generalized lex techniques for other websites but HN is so simple it can be done via a braindead one-off as below.

Requirements: lex, cc

Usage:

   fetch -4o yc.htm https://news.ycombinator.com
   yc < yc.htm

To compile this I use something like

    flex -Crfa -8 -i yc.l;
    cc -Wall -pipe lex.yy.c -static -o yc;

Save the text below as yc.l then compile as above.

    #define jmp BEGIN
    #define p printf
    #define x yytext
   %s aa bb cc dd ee ff gg hh
   %s ii jj kk ll mm nn oo 
   aa "span class=\"rank\""
   bb "a href=\""
   cc score_........
   dd \>
    /* #include <time.h> */
    /* #include <util.h> */
   %%
   [^\12\40-\176]
   , p("%2c");
    /* rank (dont care) */
   {aa} jmp aa;
    /* <aa>[1-9][^<\.]* p("\n%s,",x);jmp bb; */
   <aa>[1-9][^<\.]* p("\n");jmp bb; 
    /* url */
   <bb>{bb} jmp cc;
   <cc>http[^"]* p("%s,",x);jmp dd; 
    /* title */
   <dd>{dd} jmp ee;
    /* <ee>[^><]* p("%s,",x);jmp ff; */
   <ee>[^><]* p("%s",x);jmp ff;
    /* host (omit) */
    /* points (dont care) */
   <ff>{cc} jmp gg;
   <gg>{dd} jmp hh;
    /* <hh>[1-9][^<> p]* p("%s,",x);jmp ii; */
   <hh>[1-9][^<> p]* p(",");jmp ii; 
    /* user */
   <ii>{bb} jmp jj;
   <jj>http[^"]* p("%s,",x);jmp kk;
    /* time (dont care) */
   <kk>{bb} jmp ll;
    /* <ll>http[^"]* ; */
   <ll>http[^"]* jmp mm;
    /* unix time (dont care) */
    /* <ll>[1-9][^<]* { 
    time_t t0; time_t t1; time_t t2;
    t1=time(&t0);
    t2=parsedate(x,&t1,0);
    p("%d,",t2); 
    jmp mm;
    } */
    /* item */
   <mm>{bb} jmp nn;
    /* <nn>http[^"]* p("%s,",x);jmp oo; */
   <nn>http[^"]* p("%s",x);jmp oo;
    /* comments (dont care) */
   <oo>{dd} jmp oo;
    /* <oo>[1-9d][^ <]* p("%s",x);jmp 0; */
   <oo>[1-9d][^ <]* jmp 0;
   .
   \n
   %%
   main(){ yylex();}
   yywrap(){ p("\n");}

2 comments

dchest 3357 days ago

There's an API: https://github.com/HackerNews/API

link

deno 3357 days ago

And I thought using regex for parsing HTML was bad.

link