Hacker News new | ask | show | jobs
by textmode 898 days ago
Correction:

      /* remove HTTP headers from multiple gzip or single zip from stdin */
    
     int fileno (FILE *);
     int setenv (const char *, const char *, int);
     #define jmp (yy_start) = 1 + 2 *
     int x;
    %option nounput noinput noyywrap
    %%
    HTTP\/[\40-\176]+\x0d\x0a x++;
    [\40-\176]+:[\40-\176]+\r\n if(!x)fwrite(yytext,1,yyleng,yyout);
    \x0D\x0A if(!x)fwrite(yytext,1,yyleng,yyout);x=0;
    %%
    int main()
    { 
    yylex();
    exit(0);
    }

Usage example:

Retrieve hostnames, IP addresses and (if available) sitemap URLs from latest Common Crawl.

     ftp -4 https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-50/robotstxt.paths.gz # <-- 180K
     gzip -dc robotstxt.paths.gz \
     |head -5 \
     |sed 's>.*>GET /& HTTP/1.1[]Host: data.commoncrawl.org[]Connection: >;
           $!s/$/keep-alive[]/;$s/$/close[]/' \
     |tr [] '\r\n' \
     |openssl s_client -quiet -connect data.commoncrawl.org:443 \
     |yy054 \
     |zegrep -a '(^Sitemap:)|(^Host:)|(^WARC-Target-URI:)|(^WARC-IP-Address:)' > 1.txt
     exec cat 1.txt