From e2392d917a60d5ef253dc35669a81b80154401ce Mon Sep 17 00:00:00 2001
From: Pedro Folloni Pesserl <fpesserl7@gmail.com>
Date: Fri, 24 Feb 2023 10:32:21 -0300
Subject: [PATCH] tshell: p3: add sed command

---
Review notes (this section is ignored when the patch is applied):
- Line structure restored; the patch had been collapsed onto one line.
- Added lines revised: "$XML" and "$CSV" are quoted, the explanatory
  comments were moved above the pipeline and translated to English.
  Context and removed lines are unchanged. The post-image blob id on the
  tshell_p3.sh "index" line still refers to the originally committed text.

 .gitignore          |  3 +--
 tshell/tshell_p3.sh | 25 +++++++++++++++++++++----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2599fc8..383975b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,2 @@
 ./nomes-de-bebes/
-./PubMed/
-./Sinonimos/
+./tshell/temp/
diff --git a/tshell/tshell_p3.sh b/tshell/tshell_p3.sh
index f1dbb90..8356078 100755
--- a/tshell/tshell_p3.sh
+++ b/tshell/tshell_p3.sh
@@ -35,7 +35,24 @@ CSV=$2
 confere_arq_saida $CSV
 CSV=$(realpath $CSV)
 
-# separar os campos necessários do arquivo xml.gz
-# zcat $XML | xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList"
-
-
+# Extract the useful fields from the xml.gz file and append them to the CSV.
+# sed looks for a PMID field followed by an ArticleTitle field followed by
+# an Abstract field and prints them. If a MeshHeadingList field comes right
+# after the Abstract, it is printed as well.
+zcat "$XML" |
+    xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" |
+    sed -En '
+        /<PMID/{
+            N;
+            /<ArticleTitle>/{
+                N;
+                /<Abstract>/{
+                    p;
+                    n;
+                    /<MeshHeadingList>/{
+                        p;
+                    }
+                }
+            }
+        }' >> "$CSV"
+# grep 'PMID' FILE_WITH_SEPARATED_FIELDS | awk '{ print substr($0, 16, length($0)-22) }' | sed 's/">//'
-- 
GitLab