diff --git a/.gitignore b/.gitignore index 2599fc8564960b07c40823842ea020435d4c0403..383975bde534715323d4a3932c4354ef5782559f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ ./nomes-de-bebes/ -./PubMed/ -./Sinonimos/ +./tshell/temp/ diff --git a/tshell/tshell_p3.sh b/tshell/tshell_p3.sh index f1dbb902f910282651b4f2ffacf95f7beedc0980..835607884c380c36ed0758288b0c17d5fca398fd 100755 --- a/tshell/tshell_p3.sh +++ b/tshell/tshell_p3.sh @@ -35,7 +35,24 @@ CSV=$2 confere_arq_saida $CSV CSV=$(realpath $CSV) -# separar os campos necessários do arquivo xml.gz -# zcat $XML | xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" - - +# separar os campos úteis do arquivo xml.gz +zcat $XML | \ + xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" | \ + # o sed acha um campo PMID, seguido de um campo ArticleTitle, seguido + # de um campo Abstract, e imprime. Se houver um campo MeshHeadingList + # após o Abstract, imprime também. + sed -En ' + /<PMID/{ + N; + /<ArticleTitle>/{ + N; + /<Abstract>/{ + p; + n; + /<MeshHeadingList>/{ + p; + } + } + } + }' >> $CSV +# grep 'PMID' ARQUIVO_COM_CAMPOS_SEPARADOS | awk '{ print substr($0, 16, length($0)-22) }' | sed 's/">//'